libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "x86/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 104 #define LEVEL_TAB_BITS 8
 105 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 106
 107 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 108     MpegEncContext * const s = &h->s;
 109     const int mb_xy= h->mb_xy;
 110     int topleft_xy, top_xy, topright_xy, left_xy[2];
 111     int topleft_type, top_type, topright_type, left_type[2];
 112     const int * left_block;
 113     int topleft_partition= -1;
 114     int i;
 115
 116     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 117
 118     //FIXME deblocking could skip the intra and nnz parts.
 119     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 120         return;
 121
 122     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 123      * stuff, I can't imagine that these complex rules are worth it. */
 124
 125     topleft_xy = top_xy - 1;
 126     topright_xy= top_xy + 1;
 127     left_xy[1] = left_xy[0] = mb_xy-1;
 128     left_block = left_block_options[0];
 129     if(FRAME_MBAFF){
 130         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 131         const int top_pair_xy      = pair_xy     - s->mb_stride;
 132         const int topleft_pair_xy  = top_pair_xy - 1;
 133         const int topright_pair_xy = top_pair_xy + 1;
 134         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 135         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 136         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 137         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 138         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 139         const int bottom = (s->mb_y & 1);
 140         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 141
 142         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 143             top_xy -= s->mb_stride;
 144         }
 145         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 146             topleft_xy -= s->mb_stride;
 147         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 148             topleft_xy += s->mb_stride;
 149             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 150             topleft_partition = 0;
 151         }
 152         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 153             topright_xy -= s->mb_stride;
 154         }
 155         if (left_mb_field_flag != curr_mb_field_flag) {
 156             left_xy[1] = left_xy[0] = pair_xy - 1;
 157             if (curr_mb_field_flag) {
 158                 left_xy[1] += s->mb_stride;
 159                 left_block = left_block_options[3];
 160             } else {
 161                 left_block= left_block_options[2 - bottom];
 162             }
 163         }
 164     }
 165
 166     h->top_mb_xy = top_xy;
 167     h->left_mb_xy[0] = left_xy[0];
 168     h->left_mb_xy[1] = left_xy[1];
 169     if(for_deblock){
 170         topleft_type = 0;
 171         topright_type = 0;
 172         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 173         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 174         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 175
 176         if(MB_MBAFF && !IS_INTRA(mb_type)){
 177             int list;
 178             for(list=0; list<h->list_count; list++){
 179                 //These values where changed for ease of performing MC, we need to change them back
 180                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 181                 //the MC code from changing ref_cache and rather use a temporary array.
 182                 if(USES_LIST(mb_type,list)){
 183                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 184                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 185                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 186                     ref += h->b8_stride;
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                 }
 190             }
 191         }
 192     }else{
 193         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 194         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 195         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 196         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 197         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 198
 199     if(IS_INTRA(mb_type)){
 200         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 201         h->topleft_samples_available=
 202         h->top_samples_available=
 203         h->left_samples_available= 0xFFFF;
 204         h->topright_samples_available= 0xEEEA;
 205
 206         if(!(top_type & type_mask)){
 207             h->topleft_samples_available= 0xB3FF;
 208             h->top_samples_available= 0x33FF;
 209             h->topright_samples_available= 0x26EA;
 210         }
 211         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 212             if(IS_INTERLACED(mb_type)){
 213                 if(!(left_type[0] & type_mask)){
 214                     h->topleft_samples_available&= 0xDFFF;
 215                     h->left_samples_available&= 0x5FFF;
 216                 }
 217                 if(!(left_type[1] & type_mask)){
 218                     h->topleft_samples_available&= 0xFF5F;
 219                     h->left_samples_available&= 0xFF5F;
 220                 }
 221             }else{
 222                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 223                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 224                 assert(left_xy[0] == left_xy[1]);
 225                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 226                     h->topleft_samples_available&= 0xDF5F;
 227                     h->left_samples_available&= 0x5F5F;
 228                 }
 229             }
 230         }else{
 231             if(!(left_type[0] & type_mask)){
 232                 h->topleft_samples_available&= 0xDF5F;
 233                 h->left_samples_available&= 0x5F5F;
 234             }
 235         }
 236
 237         if(!(topleft_type & type_mask))
 238             h->topleft_samples_available&= 0x7FFF;
 239
 240         if(!(topright_type & type_mask))
 241             h->topright_samples_available&= 0xFBFF;
 242
 243         if(IS_INTRA4x4(mb_type)){
 244             if(IS_INTRA4x4(top_type)){
 245                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 246                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 247                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 248                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 249             }else{
 250                 int pred;
 251                 if(!(top_type & type_mask))
 252                     pred= -1;
 253                 else{
 254                     pred= 2;
 255                 }
 256                 h->intra4x4_pred_mode_cache[4+8*0]=
 257                 h->intra4x4_pred_mode_cache[5+8*0]=
 258                 h->intra4x4_pred_mode_cache[6+8*0]=
 259                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 260             }
 261             for(i=0; i<2; i++){
 262                 if(IS_INTRA4x4(left_type[i])){
 263                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 264                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 265                 }else{
 266                     int pred;
 267                     if(!(left_type[i] & type_mask))
 268                         pred= -1;
 269                     else{
 270                         pred= 2;
 271                     }
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 274                 }
 275             }
 276         }
 277     }
 278     }
 279
 280
 281 /*
 282 0 . T T. T T T T
 283 1 L . .L . . . .
 284 2 L . .L . . . .
 285 3 . T TL . . . .
 286 4 L . .L . . . .
 287 5 L . .. . . . .
 288 */
 289 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 290     if(top_type){
 291         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 292         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 293         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 294         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 295
 296         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 297         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 298
 299         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 300         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 301
 302     }else{
 303         h->non_zero_count_cache[4+8*0]=
 304         h->non_zero_count_cache[5+8*0]=
 305         h->non_zero_count_cache[6+8*0]=
 306         h->non_zero_count_cache[7+8*0]=
 307
 308         h->non_zero_count_cache[1+8*0]=
 309         h->non_zero_count_cache[2+8*0]=
 310
 311         h->non_zero_count_cache[1+8*3]=
 312         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 313
 314     }
 315
 316     for (i=0; i<2; i++) {
 317         if(left_type[i]){
 318             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 319             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 320             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 321             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 322         }else{
 323             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 324             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 325             h->non_zero_count_cache[0+8*1 +   8*i]=
 326             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 327         }
 328     }
 329
 330     if( h->pps.cabac ) {
 331         // top_cbp
 332         if(top_type) {
 333             h->top_cbp = h->cbp_table[top_xy];
 334         } else if(IS_INTRA(mb_type)) {
 335             h->top_cbp = 0x1C0;
 336         } else {
 337             h->top_cbp = 0;
 338         }
 339         // left_cbp
 340         if (left_type[0]) {
 341             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 342         } else if(IS_INTRA(mb_type)) {
 343             h->left_cbp = 0x1C0;
 344         } else {
 345             h->left_cbp = 0;
 346         }
 347         if (left_type[0]) {
 348             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 349         }
 350         if (left_type[1]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 352         }
 353     }
 354
 355 #if 1
 356     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 357         int list;
 358         for(list=0; list<h->list_count; list++){
 359             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 360                 /*if(!h->mv_cache_clean[list]){
 361                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 362                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 363                     h->mv_cache_clean[list]= 1;
 364                 }*/
 365                 continue;
 366             }
 367             h->mv_cache_clean[list]= 0;
 368
 369             if(USES_LIST(top_type, list)){
 370                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 371                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 372                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 373                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 374                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 376                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 377                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 378                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 379                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 380             }else{
 381                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 382                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 383                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 385                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 386             }
 387
 388             for(i=0; i<2; i++){
 389                 int cache_idx = scan8[0] - 1 + i*2*8;
 390                 if(USES_LIST(left_type[i], list)){
 391                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 392                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 393                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 394                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 395                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 396                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 397                 }else{
 398                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 399                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 400                     h->ref_cache[list][cache_idx  ]=
 401                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 402                 }
 403             }
 404
 405             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 406                 continue;
 407
 408             if(USES_LIST(topleft_type, list)){
 409                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 410                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 411                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 412                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 413             }else{
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 416             }
 417
 418             if(USES_LIST(topright_type, list)){
 419                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 420                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 421                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 422                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 423             }else{
 424                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 426             }
 427
 428             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 429                 continue;
 430
 431             h->ref_cache[list][scan8[5 ]+1] =
 432             h->ref_cache[list][scan8[7 ]+1] =
 433             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 434             h->ref_cache[list][scan8[4 ]] =
 435             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 436             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 437             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 438             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 439             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 440             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 441
 442             if( h->pps.cabac ) {
 443                 /* XXX beurk, Load mvd */
 444                 if(USES_LIST(top_type, list)){
 445                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 446                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 447                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 448                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 450                 }else{
 451                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 452                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 453                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 455                 }
 456                 if(USES_LIST(left_type[0], list)){
 457                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 459                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 460                 }else{
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 463                 }
 464                 if(USES_LIST(left_type[1], list)){
 465                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 466                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 468                 }else{
 469                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 471                 }
 472                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 473                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 474                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 475                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 477
 478                 if(h->slice_type_nos == FF_B_TYPE){
 479                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 480
 481                     if(IS_DIRECT(top_type)){
 482                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 483                     }else if(IS_8X8(top_type)){
 484                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 485                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 486                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 487                     }else{
 488                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 489                     }
 490
 491                     if(IS_DIRECT(left_type[0]))
 492                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 493                     else if(IS_8X8(left_type[0]))
 494                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 495                     else
 496                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 497
 498                     if(IS_DIRECT(left_type[1]))
 499                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 500                     else if(IS_8X8(left_type[1]))
 501                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 502                     else
 503                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 504                 }
 505             }
 506
 507             if(FRAME_MBAFF){
 508 #define MAP_MVS\
 509                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 510                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 511                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 512                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 513                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 515                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 516                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 517                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 518                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 519                 if(MB_FIELD){
 520 #define MAP_F2F(idx, mb_type)\
 521                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 522                         h->ref_cache[list][idx] <<= 1;\
 523                         h->mv_cache[list][idx][1] /= 2;\
 524                         h->mvd_cache[list][idx][1] /= 2;\
 525                     }
 526                     MAP_MVS
 527 #undef MAP_F2F
 528                 }else{
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] >>= 1;\
 532                         h->mv_cache[list][idx][1] <<= 1;\
 533                         h->mvd_cache[list][idx][1] <<= 1;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }
 538             }
 539         }
 540     }
 541 #endif
 542
 543     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 544 }
 545
 546 static inline void write_back_intra_pred_mode(H264Context *h){
 547     const int mb_xy= h->mb_xy;
 548
 549     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 550     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 551     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 552     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 553     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 554     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 555     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 556 }
 557
 558 /**
 559  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 560  */
 561 static inline int check_intra4x4_pred_mode(H264Context *h){
 562     MpegEncContext * const s = &h->s;
 563     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 564     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 565     int i;
 566
 567     if(!(h->top_samples_available&0x8000)){
 568         for(i=0; i<4; i++){
 569             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 570             if(status<0){
 571                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 572                 return -1;
 573             } else if(status){
 574                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 575             }
 576         }
 577     }
 578
 579     if((h->left_samples_available&0x8888)!=0x8888){
 580         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 581         for(i=0; i<4; i++){
 582             if(!(h->left_samples_available&mask[i])){
 583                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 584                 if(status<0){
 585                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 586                     return -1;
 587                 } else if(status){
 588                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 589                 }
 590             }
 591         }
 592     }
 593
 594     return 0;
 595 } //FIXME cleanup like next
 596
 597 /**
 598  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 599  */
 600 static inline int check_intra_pred_mode(H264Context *h, int mode){
 601     MpegEncContext * const s = &h->s;
 602     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 603     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 604
 605     if(mode > 6U) {
 606         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 607         return -1;
 608     }
 609
 610     if(!(h->top_samples_available&0x8000)){
 611         mode= top[ mode ];
 612         if(mode<0){
 613             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 614             return -1;
 615         }
 616     }
 617
 618     if((h->left_samples_available&0x8080) != 0x8080){
 619         mode= left[ mode ];
 620         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 621             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 622         }
 623         if(mode<0){
 624             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 625             return -1;
 626         }
 627     }
 628
 629     return mode;
 630 }
 631
 632 /**
 633  * gets the predicted intra4x4 prediction mode.
 634  */
 635 static inline int pred_intra_mode(H264Context *h, int n){
 636     const int index8= scan8[n];
 637     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 638     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 639     const int min= FFMIN(left, top);
 640
 641     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 642
 643     if(min<0) return DC_PRED;
 644     else      return min;
 645 }
 646
 647 static inline void write_back_non_zero_count(H264Context *h){
 648     const int mb_xy= h->mb_xy;
 649
 650     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 651     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 652     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 653     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 654     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 655     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 656     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 657
 658     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 659     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 660     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 661
 662     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 663     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 664     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 665 }
 666
 667 /**
 668  * gets the predicted number of non-zero coefficients.
 669  * @param n block index
 670  */
 671 static inline int pred_non_zero_count(H264Context *h, int n){
 672     const int index8= scan8[n];
 673     const int left= h->non_zero_count_cache[index8 - 1];
 674     const int top = h->non_zero_count_cache[index8 - 8];
 675     int i= left + top;
 676
 677     if(i<64) i= (i+1)>>1;
 678
 679     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 680
 681     return i&31;
 682 }
 683
 684 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 685     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 686     MpegEncContext *s = &h->s;
 687
 688     /* there is no consistent mapping of mvs to neighboring locations that will
 689      * make mbaff happy, so we can't move all this logic to fill_caches */
 690     if(FRAME_MBAFF){
 691         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 692         const int16_t *mv;
 693         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 694         *C = h->mv_cache[list][scan8[0]-2];
 695
 696         if(!MB_FIELD
 697            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 698             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 699             if(IS_INTERLACED(mb_types[topright_xy])){
 700 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 701                 const int x4 = X4, y4 = Y4;\
 702                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 703                 if(!USES_LIST(mb_type,list))\
 704                     return LIST_NOT_USED;\
 705                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 706                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 707                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 708                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 709
 710                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 711             }
 712         }
 713         if(topright_ref == PART_NOT_AVAILABLE
 714            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 715            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 716             if(!MB_FIELD
 717                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 718                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 719             }
 720             if(MB_FIELD
 721                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 722                && i >= scan8[0]+8){
 723                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 724                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 725             }
 726         }
 727 #undef SET_DIAG_MV
 728     }
 729
 730     if(topright_ref != PART_NOT_AVAILABLE){
 731         *C= h->mv_cache[list][ i - 8 + part_width ];
 732         return topright_ref;
 733     }else{
 734         tprintf(s->avctx, "topright MV not available\n");
 735
 736         *C= h->mv_cache[list][ i - 8 - 1 ];
 737         return h->ref_cache[list][ i - 8 - 1 ];
 738     }
 739 }
 740
 741 /**
 742  * gets the predicted MV.
 743  * @param n the block index
 744  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 745  * @param mx the x component of the predicted motion vector
 746  * @param my the y component of the predicted motion vector
 747  */
 748 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 749     const int index8= scan8[n];
 750     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 751     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 752     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 753     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 754     const int16_t * C;
 755     int diagonal_ref, match_count;
 756
 757     assert(part_width==1 || part_width==2 || part_width==4);
 758
 759 /* mv_cache
 760   B . . A T T T T
 761   U . . L . . , .
 762   U . . L . . . .
 763   U . . L . . , .
 764   . . . L . . . .
 765 */
 766
 767     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 768     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 769     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 770     if(match_count > 1){ //most common
 771         *mx= mid_pred(A[0], B[0], C[0]);
 772         *my= mid_pred(A[1], B[1], C[1]);
 773     }else if(match_count==1){
 774         if(left_ref==ref){
 775             *mx= A[0];
 776             *my= A[1];
 777         }else if(top_ref==ref){
 778             *mx= B[0];
 779             *my= B[1];
 780         }else{
 781             *mx= C[0];
 782             *my= C[1];
 783         }
 784     }else{
 785         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 786             *mx= A[0];
 787             *my= A[1];
 788         }else{
 789             *mx= mid_pred(A[0], B[0], C[0]);
 790             *my= mid_pred(A[1], B[1], C[1]);
 791         }
 792     }
 793
 794     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 795 }
 796
 797 /**
 798  * gets the directionally predicted 16x8 MV.
 799  * @param n the block index
 800  * @param mx the x component of the predicted motion vector
 801  * @param my the y component of the predicted motion vector
 802  */
 803 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 804     if(n==0){
 805         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 806         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 807
 808         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 809
 810         if(top_ref == ref){
 811             *mx= B[0];
 812             *my= B[1];
 813             return;
 814         }
 815     }else{
 816         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 817         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 818
 819         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 820
 821         if(left_ref == ref){
 822             *mx= A[0];
 823             *my= A[1];
 824             return;
 825         }
 826     }
 827
 828     //RARE
 829     pred_motion(h, n, 4, list, ref, mx, my);
 830 }
 831
 832 /**
 833  * gets the directionally predicted 8x16 MV.
 834  * @param n the block index
 835  * @param mx the x component of the predicted motion vector
 836  * @param my the y component of the predicted motion vector
 837  */
 838 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 839     if(n==0){
 840         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 841         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 842
 843         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 844
 845         if(left_ref == ref){
 846             *mx= A[0];
 847             *my= A[1];
 848             return;
 849         }
 850     }else{
 851         const int16_t * C;
 852         int diagonal_ref;
 853
 854         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 855
 856         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 857
 858         if(diagonal_ref == ref){
 859             *mx= C[0];
 860             *my= C[1];
 861             return;
 862         }
 863     }
 864
 865     //RARE
 866     pred_motion(h, n, 2, list, ref, mx, my);
 867 }
 868
 869 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 870     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 871     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 872
 873     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 874
 875     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 876        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 877        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 878
 879         *mx = *my = 0;
 880         return;
 881     }
 882
 883     pred_motion(h, 0, 4, 0, 0, mx, my);
 884
 885     return;
 886 }
 887
 888 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 889     int poc0 = h->ref_list[0][i].poc;
 890     int td = av_clip(poc1 - poc0, -128, 127);
 891     if(td == 0 || h->ref_list[0][i].long_ref){
 892         return 256;
 893     }else{
 894         int tb = av_clip(poc - poc0, -128, 127);
 895         int tx = (16384 + (FFABS(td) >> 1)) / td;
 896         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 897     }
 898 }
 899
 900 static inline void direct_dist_scale_factor(H264Context * const h){
 901     MpegEncContext * const s = &h->s;
 902     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 903     const int poc1 = h->ref_list[1][0].poc;
 904     int i, field;
 905     for(field=0; field<2; field++){
 906         const int poc  = h->s.current_picture_ptr->field_poc[field];
 907         const int poc1 = h->ref_list[1][0].field_poc[field];
 908         for(i=0; i < 2*h->ref_count[0]; i++)
 909             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 910     }
 911
 912     for(i=0; i<h->ref_count[0]; i++){
 913         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 914     }
 915 }
 916
 917 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 918     MpegEncContext * const s = &h->s;
 919     Picture * const ref1 = &h->ref_list[1][0];
 920     int j, old_ref, rfield;
 921     int start= mbafi ? 16                      : 0;
 922     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 923     int interl= mbafi || s->picture_structure != PICT_FRAME;
 924
 925     /* bogus; fills in for missing frames */
 926     memset(map[list], 0, sizeof(map[list]));
 927
 928     for(rfield=0; rfield<2; rfield++){
 929         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 930             int poc = ref1->ref_poc[colfield][list][old_ref];
 931
 932             if     (!interl)
 933                 poc |= 3;
 934             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 935                 poc= (poc&~3) + rfield + 1;
 936
 937             for(j=start; j<end; j++){
 938                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 939                     int cur_ref= mbafi ? (j-16)^field : j;
 940                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 941                     if(rfield == field)
 942                         map[list][old_ref] = cur_ref;
 943                     break;
 944                 }
 945             }
 946         }
 947     }
 948 }
 949
 950 static inline void direct_ref_list_init(H264Context * const h){
 951     MpegEncContext * const s = &h->s;
 952     Picture * const ref1 = &h->ref_list[1][0];
 953     Picture * const cur = s->current_picture_ptr;
 954     int list, j, field;
 955     int sidx= (s->picture_structure&1)^1;
 956     int ref1sidx= (ref1->reference&1)^1;
 957
 958     for(list=0; list<2; list++){
 959         cur->ref_count[sidx][list] = h->ref_count[list];
 960         for(j=0; j<h->ref_count[list]; j++)
 961             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 962     }
 963
 964     if(s->picture_structure == PICT_FRAME){
 965         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 966         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 967     }
 968
 969     cur->mbaff= FRAME_MBAFF;
 970
 971     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 972         return;
 973
 974     for(list=0; list<2; list++){
 975         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 976         for(field=0; field<2; field++)
 977             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 978     }
 979 }
 980
 981 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 982     MpegEncContext * const s = &h->s;
 983     int b8_stride = h->b8_stride;
 984     int b4_stride = h->b_stride;
 985     int mb_xy = h->mb_xy;
 986     int mb_type_col[2];
 987     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 988     const int8_t *l1ref0, *l1ref1;
 989     const int is_b8x8 = IS_8X8(*mb_type);
 990     unsigned int sub_mb_type;
 991     int i8, i4;
 992
 993 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 994
 995     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 996         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
 997             int cur_poc = s->current_picture_ptr->poc;
 998             int *col_poc = h->ref_list[1]->field_poc;
 999             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1000             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1001             b8_stride = 0;
1002         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1003             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1004             mb_xy += s->mb_stride*fieldoff;
1005         }
1006         goto single_col;
1007     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1008         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1009             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1010             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1011             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1012             b8_stride *= 3;
1013             b4_stride *= 6;
1014             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1015             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1016                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1017                 && !is_b8x8){
1018                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1019                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1020             }else{
1021                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1022                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1023             }
1024         }else{                                           //     AFR/FR    -> AFR/FR
1025 single_col:
1026             mb_type_col[0] =
1027             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1028             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1029                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1030                 * so we know exactly what block size to use */
1031                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1032                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1033             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1034                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1035                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1036             }else{
1037                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1038                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1039             }
1040         }
1041     }
1042
1043     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1044     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1045     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1046     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1047     if(!b8_stride){
1048         if(s->mb_y&1){
1049             l1ref0 += h->b8_stride;
1050             l1ref1 += h->b8_stride;
1051             l1mv0  +=  2*b4_stride;
1052             l1mv1  +=  2*b4_stride;
1053         }
1054     }
1055
1056     if(h->direct_spatial_mv_pred){
1057         int ref[2];
1058         int mv[2][2];
1059         int list;
1060
1061         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1062
1063         /* ref = min(neighbors) */
1064         for(list=0; list<2; list++){
1065             int refa = h->ref_cache[list][scan8[0] - 1];
1066             int refb = h->ref_cache[list][scan8[0] - 8];
1067             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1068             if(refc == PART_NOT_AVAILABLE)
1069                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1070             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1071             if(ref[list] < 0)
1072                 ref[list] = -1;
1073         }
1074
1075         if(ref[0] < 0 && ref[1] < 0){
1076             ref[0] = ref[1] = 0;
1077             mv[0][0] = mv[0][1] =
1078             mv[1][0] = mv[1][1] = 0;
1079         }else{
1080             for(list=0; list<2; list++){
1081                 if(ref[list] >= 0)
1082                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1083                 else
1084                     mv[list][0] = mv[list][1] = 0;
1085             }
1086         }
1087
1088         if(ref[1] < 0){
1089             if(!is_b8x8)
1090                 *mb_type &= ~MB_TYPE_L1;
1091             sub_mb_type &= ~MB_TYPE_L1;
1092         }else if(ref[0] < 0){
1093             if(!is_b8x8)
1094                 *mb_type &= ~MB_TYPE_L0;
1095             sub_mb_type &= ~MB_TYPE_L0;
1096         }
1097
1098         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1099             for(i8=0; i8<4; i8++){
1100                 int x8 = i8&1;
1101                 int y8 = i8>>1;
1102                 int xy8 = x8+y8*b8_stride;
1103                 int xy4 = 3*x8+y8*b4_stride;
1104                 int a=0, b=0;
1105
1106                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1107                     continue;
1108                 h->sub_mb_type[i8] = sub_mb_type;
1109
1110                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1111                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1112                 if(!IS_INTRA(mb_type_col[y8])
1113                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1114                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1115                     if(ref[0] > 0)
1116                         a= pack16to32(mv[0][0],mv[0][1]);
1117                     if(ref[1] > 0)
1118                         b= pack16to32(mv[1][0],mv[1][1]);
1119                 }else{
1120                     a= pack16to32(mv[0][0],mv[0][1]);
1121                     b= pack16to32(mv[1][0],mv[1][1]);
1122                 }
1123                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1124                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1125             }
1126         }else if(IS_16X16(*mb_type)){
1127             int a=0, b=0;
1128
1129             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1130             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1131             if(!IS_INTRA(mb_type_col[0])
1132                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1133                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1134                        && (h->x264_build>33 || !h->x264_build)))){
1135                 if(ref[0] > 0)
1136                     a= pack16to32(mv[0][0],mv[0][1]);
1137                 if(ref[1] > 0)
1138                     b= pack16to32(mv[1][0],mv[1][1]);
1139             }else{
1140                 a= pack16to32(mv[0][0],mv[0][1]);
1141                 b= pack16to32(mv[1][0],mv[1][1]);
1142             }
1143             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1144             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1145         }else{
1146             for(i8=0; i8<4; i8++){
1147                 const int x8 = i8&1;
1148                 const int y8 = i8>>1;
1149
1150                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1151                     continue;
1152                 h->sub_mb_type[i8] = sub_mb_type;
1153
1154                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1155                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1156                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1157                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1158
1159                 /* col_zero_flag */
1160                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1161                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1162                                                   && (h->x264_build>33 || !h->x264_build)))){
1163                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1164                     if(IS_SUB_8X8(sub_mb_type)){
1165                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1166                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1167                             if(ref[0] == 0)
1168                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1169                             if(ref[1] == 0)
1170                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1171                         }
1172                     }else
1173                     for(i4=0; i4<4; i4++){
1174                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1178                             if(ref[1] == 0)
1179                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1180                         }
1181                     }
1182                 }
1183             }
1184         }
1185     }else{ /* direct temporal mv pred */
1186         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1187         const int *dist_scale_factor = h->dist_scale_factor;
1188         int ref_offset= 0;
1189
1190         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1191             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1192             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1193             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1194         }
1195         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1196             ref_offset += 16;
1197
1198         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1199             /* FIXME assumes direct_8x8_inference == 1 */
1200             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1201
1202             for(i8=0; i8<4; i8++){
1203                 const int x8 = i8&1;
1204                 const int y8 = i8>>1;
1205                 int ref0, scale;
1206                 const int16_t (*l1mv)[2]= l1mv0;
1207
1208                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1209                     continue;
1210                 h->sub_mb_type[i8] = sub_mb_type;
1211
1212                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1213                 if(IS_INTRA(mb_type_col[y8])){
1214                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1215                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1216                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1217                     continue;
1218                 }
1219
1220                 ref0 = l1ref0[x8 + y8*b8_stride];
1221                 if(ref0 >= 0)
1222                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1223                 else{
1224                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1225                     l1mv= l1mv1;
1226                 }
1227                 scale = dist_scale_factor[ref0];
1228                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1229
1230                 {
1231                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1232                     int my_col = (mv_col[1]<<y_shift)/2;
1233                     int mx = (scale * mv_col[0] + 128) >> 8;
1234                     int my = (scale * my_col + 128) >> 8;
1235                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1236                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1237                 }
1238             }
1239             return;
1240         }
1241
1242         /* one-to-one mv scaling */
1243
1244         if(IS_16X16(*mb_type)){
1245             int ref, mv0, mv1;
1246
1247             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1248             if(IS_INTRA(mb_type_col[0])){
1249                 ref=mv0=mv1=0;
1250             }else{
1251                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1252                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1253                 const int scale = dist_scale_factor[ref0];
1254                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1255                 int mv_l0[2];
1256                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1257                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1258                 ref= ref0;
1259                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1260                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1261             }
1262             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1263             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1264             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1265         }else{
1266             for(i8=0; i8<4; i8++){
1267                 const int x8 = i8&1;
1268                 const int y8 = i8>>1;
1269                 int ref0, scale;
1270                 const int16_t (*l1mv)[2]= l1mv0;
1271
1272                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1273                     continue;
1274                 h->sub_mb_type[i8] = sub_mb_type;
1275                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1276                 if(IS_INTRA(mb_type_col[0])){
1277                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1278                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1279                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1280                     continue;
1281                 }
1282
1283                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1284                 if(ref0 >= 0)
1285                     ref0 = map_col_to_list0[0][ref0];
1286                 else{
1287                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1288                     l1mv= l1mv1;
1289                 }
1290                 scale = dist_scale_factor[ref0];
1291
1292                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1293                 if(IS_SUB_8X8(sub_mb_type)){
1294                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1295                     int mx = (scale * mv_col[0] + 128) >> 8;
1296                     int my = (scale * mv_col[1] + 128) >> 8;
1297                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1298                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1299                 }else
1300                 for(i4=0; i4<4; i4++){
1301                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1302                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1303                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1304                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1305                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1306                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1307                 }
1308             }
1309         }
1310     }
1311 }
1312
1313 static inline void write_back_motion(H264Context *h, int mb_type){
1314     MpegEncContext * const s = &h->s;
1315     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1316     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1317     int list;
1318
1319     if(!USES_LIST(mb_type, 0))
1320         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1321
1322     for(list=0; list<h->list_count; list++){
1323         int y;
1324         if(!USES_LIST(mb_type, list))
1325             continue;
1326
1327         for(y=0; y<4; y++){
1328             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1329             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1330         }
1331         if( h->pps.cabac ) {
1332             if(IS_SKIP(mb_type))
1333                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1334             else
1335             for(y=0; y<4; y++){
1336                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1337                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1338             }
1339         }
1340
1341         {
1342             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1343             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1344             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1345             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1346             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1347         }
1348     }
1349
1350     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1351         if(IS_8X8(mb_type)){
1352             uint8_t *direct_table = &h->direct_table[b8_xy];
1353             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1354             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1355             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1356         }
1357     }
1358 }
1359
1360 /**
1361  * Decodes a network abstraction layer unit.
1362  * @param consumed is the number of bytes used as input
1363  * @param length is the length of the array
1364  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1365  * @returns decoded bytes, might be src+1 if no escapes
1366  */
1367 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1368     int i, si, di;
1369     uint8_t *dst;
1370     int bufidx;
1371
1372 //    src[0]&0x80;                //forbidden bit
1373     h->nal_ref_idc= src[0]>>5;
1374     h->nal_unit_type= src[0]&0x1F;
1375
1376     src++; length--;
1377 #if 0
1378     for(i=0; i<length; i++)
1379         printf("%2X ", src[i]);
1380 #endif
1381
1382 #ifdef HAVE_FAST_UNALIGNED
1383 # ifdef HAVE_FAST_64BIT
1384 #   define RS 7
1385     for(i=0; i+1<length; i+=9){
1386         if(!((~*(uint64_t*)(src+i) & (*(uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1387 # else
1388 #   define RS 3
1389     for(i=0; i+1<length; i+=5){
1390         if(!((~*(uint32_t*)(src+i) & (*(uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1391 # endif
1392             continue;
1393         if(i>0 && !src[i]) i--;
1394         while(src[i]) i++;
1395 #else
1396 #   define RS 0
1397     for(i=0; i+1<length; i+=2){
1398         if(src[i]) continue;
1399         if(i>0 && src[i-1]==0) i--;
1400 #endif
1401         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1402             if(src[i+2]!=3){
1403                 /* startcode, so we must be past the end */
1404                 length=i;
1405             }
1406             break;
1407         }
1408         i-= RS;
1409     }
1410
1411     if(i>=length-1){ //no escaped 0
1412         *dst_length= length;
1413         *consumed= length+1; //+1 for the header
1414         return src;
1415     }
1416
1417     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1418     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1419     dst= h->rbsp_buffer[bufidx];
1420
1421     if (dst == NULL){
1422         return NULL;
1423     }
1424
1425 //printf("decoding esc\n");
1426     memcpy(dst, src, i);
1427     si=di=i;
1428     while(si+2<length){
1429         //remove escapes (very rare 1:2^22)
1430         if(src[si+2]>3){
1431             dst[di++]= src[si++];
1432             dst[di++]= src[si++];
1433         }else if(src[si]==0 && src[si+1]==0){
1434             if(src[si+2]==3){ //escape
1435                 dst[di++]= 0;
1436                 dst[di++]= 0;
1437                 si+=3;
1438                 continue;
1439             }else //next start code
1440                 goto nsc;
1441         }
1442
1443         dst[di++]= src[si++];
1444     }
1445     while(si<length)
1446         dst[di++]= src[si++];
1447 nsc:
1448
1449     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1450
1451     *dst_length= di;
1452     *consumed= si + 1;//+1 for the header
1453 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1454     return dst;
1455 }
1456
1457 /**
1458  * identifies the exact end of the bitstream
1459  * @return the length of the trailing, or 0 if damaged
1460  */
1461 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1462     int v= *src;
1463     int r;
1464
1465     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1466
1467     for(r=1; r<9; r++){
1468         if(v&1) return r;
1469         v>>=1;
1470     }
1471     return 0;
1472 }
1473
1474 /**
1475  * IDCT transforms the 16 dc values and dequantizes them.
1476  * @param qp quantization parameter
1477  */
1478 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1479 #define stride 16
1480     int i;
1481     int temp[16]; //FIXME check if this is a good idea
1482     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1483     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1484
1485 //memset(block, 64, 2*256);
1486 //return;
1487     for(i=0; i<4; i++){
1488         const int offset= y_offset[i];
1489         const int z0= block[offset+stride*0] + block[offset+stride*4];
1490         const int z1= block[offset+stride*0] - block[offset+stride*4];
1491         const int z2= block[offset+stride*1] - block[offset+stride*5];
1492         const int z3= block[offset+stride*1] + block[offset+stride*5];
1493
1494         temp[4*i+0]= z0+z3;
1495         temp[4*i+1]= z1+z2;
1496         temp[4*i+2]= z1-z2;
1497         temp[4*i+3]= z0-z3;
1498     }
1499
1500     for(i=0; i<4; i++){
1501         const int offset= x_offset[i];
1502         const int z0= temp[4*0+i] + temp[4*2+i];
1503         const int z1= temp[4*0+i] - temp[4*2+i];
1504         const int z2= temp[4*1+i] - temp[4*3+i];
1505         const int z3= temp[4*1+i] + temp[4*3+i];
1506
1507         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1508         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1509         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1510         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1511     }
1512 }
1513
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; works in place on block and halves every output.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterflies on the input, results go to temp[] */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        out[0]= sum04  + sum15;
        out[1]= diff04 + diff15;
        out[2]= diff04 - diff15;
        out[3]= sum04  - sum15;
    }

    /* second pass: butterflies on temp[], halve and store back */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int diff02= temp[4*0+i] - temp[4*2+i];
        const int diff13= temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1553
1554 #undef xStride
1555 #undef stride
1556
1557 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1558     const int stride= 16*2;
1559     const int xStride= 16;
1560     int a,b,c,d,e;
1561
1562     a= block[stride*0 + xStride*0];
1563     b= block[stride*0 + xStride*1];
1564     c= block[stride*1 + xStride*0];
1565     d= block[stride*1 + xStride*1];
1566
1567     e= a-b;
1568     a= a+b;
1569     b= c-d;
1570     c= c+d;
1571
1572     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1573     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1574     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1575     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1576 }
1577
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code).
 * The four values sit at offsets 0, 16, 32 and 48 of block.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int xStride= 16;
    DCTELEM * const top   = block;
    DCTELEM * const bottom= block + 16*2;

    const int top_sum    = top[0]    + top[xStride];
    const int top_diff   = top[0]    - top[xStride];
    const int bottom_sum = bottom[0] + bottom[xStride];
    const int bottom_diff= bottom[0] - bottom[xStride];

    top[0]         = (top_sum  + bottom_sum );
    top[xStride]   = (top_diff + bottom_diff);
    bottom[0]      = (top_sum  - bottom_sum );
    bottom[xStride]= (top_diff - bottom_diff);
}
#endif
1600
1601 /**
1602  * gets the chroma qp.
1603  */
1604 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1605     return h->pps.chroma_qp_table[t][qscale];
1606 }
1607
1608 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1609                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1610                            int src_x_offset, int src_y_offset,
1611                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1612     MpegEncContext * const s = &h->s;
1613     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1614     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1615     const int luma_xy= (mx&3) + ((my&3)<<2);
1616     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1617     uint8_t * src_cb, * src_cr;
1618     int extra_width= h->emu_edge_width;
1619     int extra_height= h->emu_edge_height;
1620     int emu=0;
1621     const int full_mx= mx>>2;
1622     const int full_my= my>>2;
1623     const int pic_width  = 16*s->mb_width;
1624     const int pic_height = 16*s->mb_height >> MB_FIELD;
1625
1626     if(mx&7) extra_width -= 3;
1627     if(my&7) extra_height -= 3;
1628
1629     if(   full_mx < 0-extra_width
1630        || full_my < 0-extra_height
1631        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1632        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1633         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1634             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1635         emu=1;
1636     }
1637
1638     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1639     if(!square){
1640         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1641     }
1642
1643     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1644
1645     if(MB_FIELD){
1646         // chroma offset when predicting from a field of opposite parity
1647         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1648         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1649     }
1650     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1651     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1652
1653     if(emu){
1654         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1655             src_cb= s->edge_emu_buffer;
1656     }
1657     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1658
1659     if(emu){
1660         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1661             src_cr= s->edge_emu_buffer;
1662     }
1663     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1664 }
1665
1666 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1667                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1668                            int x_offset, int y_offset,
1669                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1670                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1671                            int list0, int list1){
1672     MpegEncContext * const s = &h->s;
1673     qpel_mc_func *qpix_op=  qpix_put;
1674     h264_chroma_mc_func chroma_op= chroma_put;
1675
1676     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1677     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1678     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1679     x_offset += 8*s->mb_x;
1680     y_offset += 8*(s->mb_y >> MB_FIELD);
1681
1682     if(list0){
1683         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1684         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1685                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1686                            qpix_op, chroma_op);
1687
1688         qpix_op=  qpix_avg;
1689         chroma_op= chroma_avg;
1690     }
1691
1692     if(list1){
1693         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1694         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1695                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1696                            qpix_op, chroma_op);
1697     }
1698 }
1699
1700 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1701                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1702                            int x_offset, int y_offset,
1703                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1704                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1705                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1706                            int list0, int list1){
1707     MpegEncContext * const s = &h->s;
1708
1709     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1710     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1711     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1712     x_offset += 8*s->mb_x;
1713     y_offset += 8*(s->mb_y >> MB_FIELD);
1714
1715     if(list0 && list1){
1716         /* don't optimize for luma-only case, since B-frames usually
1717          * use implicit weights => chroma too. */
1718         uint8_t *tmp_cb = s->obmc_scratchpad;
1719         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1720         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1721         int refn0 = h->ref_cache[0][ scan8[n] ];
1722         int refn1 = h->ref_cache[1][ scan8[n] ];
1723
1724         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1725                     dest_y, dest_cb, dest_cr,
1726                     x_offset, y_offset, qpix_put, chroma_put);
1727         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1728                     tmp_y, tmp_cb, tmp_cr,
1729                     x_offset, y_offset, qpix_put, chroma_put);
1730
1731         if(h->use_weight == 2){
1732             int weight0 = h->implicit_weight[refn0][refn1];
1733             int weight1 = 64 - weight0;
1734             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1736             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1737         }else{
1738             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1739                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1740                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1741             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1742                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1743                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1744             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1745                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1746                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1747         }
1748     }else{
1749         int list = list1 ? 1 : 0;
1750         int refn = h->ref_cache[list][ scan8[n] ];
1751         Picture *ref= &h->ref_list[list][refn];
1752         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1753                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1754                     qpix_put, chroma_put);
1755
1756         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1757                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1758         if(h->use_weight_chroma){
1759             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1760                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1761             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1762                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1763         }
1764     }
1765 }
1766
1767 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1768                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1769                            int x_offset, int y_offset,
1770                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1771                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1772                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1773                            int list0, int list1){
1774     if((h->use_weight==2 && list0 && list1
1775         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1776        || h->use_weight==1)
1777         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1778                          x_offset, y_offset, qpix_put, chroma_put,
1779                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1780     else
1781         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1782                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1783 }
1784
1785 static inline void prefetch_motion(H264Context *h, int list){
1786     /* fetch pixels for estimated mv 4 macroblocks ahead
1787      * optimized for 64byte cache lines */
1788     MpegEncContext * const s = &h->s;
1789     const int refn = h->ref_cache[list][scan8[0]];
1790     if(refn >= 0){
1791         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1792         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1793         uint8_t **src= h->ref_list[list][refn].data;
1794         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1795         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1796         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1797         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1798     }
1799 }
1800
1801 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1802                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1803                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1804                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1805     MpegEncContext * const s = &h->s;
1806     const int mb_xy= h->mb_xy;
1807     const int mb_type= s->current_picture.mb_type[mb_xy];
1808
1809     assert(IS_INTER(mb_type));
1810
1811     prefetch_motion(h, 0);
1812
1813     if(IS_16X16(mb_type)){
1814         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1815                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1816                 &weight_op[0], &weight_avg[0],
1817                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1818     }else if(IS_16X8(mb_type)){
1819         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1820                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1821                 &weight_op[1], &weight_avg[1],
1822                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1823         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1824                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1825                 &weight_op[1], &weight_avg[1],
1826                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1827     }else if(IS_8X16(mb_type)){
1828         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1829                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1830                 &weight_op[2], &weight_avg[2],
1831                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1832         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1833                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1834                 &weight_op[2], &weight_avg[2],
1835                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1836     }else{
1837         int i;
1838
1839         assert(IS_8X8(mb_type));
1840
1841         for(i=0; i<4; i++){
1842             const int sub_mb_type= h->sub_mb_type[i];
1843             const int n= 4*i;
1844             int x_offset= (i&1)<<2;
1845             int y_offset= (i&2)<<1;
1846
1847             if(IS_SUB_8X8(sub_mb_type)){
1848                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1849                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1850                     &weight_op[3], &weight_avg[3],
1851                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1852             }else if(IS_SUB_8X4(sub_mb_type)){
1853                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1854                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1855                     &weight_op[4], &weight_avg[4],
1856                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1857                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1858                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1859                     &weight_op[4], &weight_avg[4],
1860                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1861             }else if(IS_SUB_4X8(sub_mb_type)){
1862                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1863                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1864                     &weight_op[5], &weight_avg[5],
1865                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1866                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1867                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1868                     &weight_op[5], &weight_avg[5],
1869                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1870             }else{
1871                 int j;
1872                 assert(IS_SUB_4X4(sub_mb_type));
1873                 for(j=0; j<4; j++){
1874                     int sub_x_offset= x_offset + 2*(j&1);
1875                     int sub_y_offset= y_offset +   (j&2);
1876                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1877                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1878                         &weight_op[6], &weight_avg[6],
1879                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1880                 }
1881             }
1882         }
1883     }
1884
1885     prefetch_motion(h, 1);
1886 }
1887
1888 static av_cold void init_cavlc_level_tab(void){
1889     int suffix_length, mask;
1890     unsigned int i;
1891
1892     for(suffix_length=0; suffix_length<7; suffix_length++){
1893         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1894             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1895             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1896
1897             mask= -(level_code&1);
1898             level_code= (((2+level_code)>>1) ^ mask) - mask;
1899             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1900                 cavlc_level_tab[suffix_length][i][0]= level_code;
1901                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1902             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1903                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1904                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1905             }else{
1906                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1907                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1908             }
1909         }
1910     }
1911 }
1912
1913 static av_cold void decode_init_vlc(void){
1914     static int done = 0;
1915
1916     if (!done) {
1917         int i;
1918         int offset;
1919         done = 1;
1920
1921         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1922         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1923         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1924                  &chroma_dc_coeff_token_len [0], 1, 1,
1925                  &chroma_dc_coeff_token_bits[0], 1, 1,
1926                  INIT_VLC_USE_NEW_STATIC);
1927
1928         offset = 0;
1929         for(i=0; i<4; i++){
1930             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1931             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1932             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1933                      &coeff_token_len [i][0], 1, 1,
1934                      &coeff_token_bits[i][0], 1, 1,
1935                      INIT_VLC_USE_NEW_STATIC);
1936             offset += coeff_token_vlc_tables_size[i];
1937         }
1938         /*
1939          * This is a one time safety check to make sure that
1940          * the packed static coeff_token_vlc table sizes
1941          * were initialized correctly.
1942          */
1943         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1944
1945         for(i=0; i<3; i++){
1946             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1947             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1948             init_vlc(&chroma_dc_total_zeros_vlc[i],
1949                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1950                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1951                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1952                      INIT_VLC_USE_NEW_STATIC);
1953         }
1954         for(i=0; i<15; i++){
1955             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1956             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1957             init_vlc(&total_zeros_vlc[i],
1958                      TOTAL_ZEROS_VLC_BITS, 16,
1959                      &total_zeros_len [i][0], 1, 1,
1960                      &total_zeros_bits[i][0], 1, 1,
1961                      INIT_VLC_USE_NEW_STATIC);
1962         }
1963
1964         for(i=0; i<6; i++){
1965             run_vlc[i].table = run_vlc_tables[i];
1966             run_vlc[i].table_allocated = run_vlc_tables_size;
1967             init_vlc(&run_vlc[i],
1968                      RUN_VLC_BITS, 7,
1969                      &run_len [i][0], 1, 1,
1970                      &run_bits[i][0], 1, 1,
1971                      INIT_VLC_USE_NEW_STATIC);
1972         }
1973         run7_vlc.table = run7_vlc_table,
1974         run7_vlc.table_allocated = run7_vlc_table_size;
1975         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1976                  &run_len [6][0], 1, 1,
1977                  &run_bits[6][0], 1, 1,
1978                  INIT_VLC_USE_NEW_STATIC);
1979
1980         init_cavlc_level_tab();
1981     }
1982 }
1983
1984 static void free_tables(H264Context *h){
1985     int i;
1986     H264Context *hx;
1987     av_freep(&h->intra4x4_pred_mode);
1988     av_freep(&h->chroma_pred_mode_table);
1989     av_freep(&h->cbp_table);
1990     av_freep(&h->mvd_table[0]);
1991     av_freep(&h->mvd_table[1]);
1992     av_freep(&h->direct_table);
1993     av_freep(&h->non_zero_count);
1994     av_freep(&h->slice_table_base);
1995     h->slice_table= NULL;
1996
1997     av_freep(&h->mb2b_xy);
1998     av_freep(&h->mb2b8_xy);
1999
2000     for(i = 0; i < h->s.avctx->thread_count; i++) {
2001         hx = h->thread_context[i];
2002         if(!hx) continue;
2003         av_freep(&hx->top_borders[1]);
2004         av_freep(&hx->top_borders[0]);
2005         av_freep(&hx->s.obmc_scratchpad);
2006     }
2007 }
2008
2009 static void init_dequant8_coeff_table(H264Context *h){
2010     int i,q,x;
2011     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2012     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2013     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2014
2015     for(i=0; i<2; i++ ){
2016         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2017             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2018             break;
2019         }
2020
2021         for(q=0; q<52; q++){
2022             int shift = div6[q];
2023             int idx = rem6[q];
2024             for(x=0; x<64; x++)
2025                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2026                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2027                     h->pps.scaling_matrix8[i][x]) << shift;
2028         }
2029     }
2030 }
2031
2032 static void init_dequant4_coeff_table(H264Context *h){
2033     int i,j,q,x;
2034     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2035     for(i=0; i<6; i++ ){
2036         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2037         for(j=0; j<i; j++){
2038             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2039                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2040                 break;
2041             }
2042         }
2043         if(j<i)
2044             continue;
2045
2046         for(q=0; q<52; q++){
2047             int shift = div6[q] + 2;
2048             int idx = rem6[q];
2049             for(x=0; x<16; x++)
2050                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2051                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2052                     h->pps.scaling_matrix4[i][x]) << shift;
2053         }
2054     }
2055 }
2056
2057 static void init_dequant_tables(H264Context *h){
2058     int i,x;
2059     init_dequant4_coeff_table(h);
2060     if(h->pps.transform_8x8_mode)
2061         init_dequant8_coeff_table(h);
2062     if(h->sps.transform_bypass){
2063         for(i=0; i<6; i++)
2064             for(x=0; x<16; x++)
2065                 h->dequant4_coeff[i][0][x] = 1<<6;
2066         if(h->pps.transform_8x8_mode)
2067             for(i=0; i<2; i++)
2068                 for(x=0; x<64; x++)
2069                     h->dequant8_coeff[i][0][x] = 1<<6;
2070     }
2071 }
2072
2073
2074 /**
2075  * allocates tables.
2076  * needs width/height
2077  */
2078 static int alloc_tables(H264Context *h){
2079     MpegEncContext * const s = &h->s;
2080     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2081     int x,y;
2082
2083     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2084
2085     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2086     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2087     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2088
2089     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2090     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2091     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2092     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2093
2094     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2095     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2096
2097     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2098     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2099     for(y=0; y<s->mb_height; y++){
2100         for(x=0; x<s->mb_width; x++){
2101             const int mb_xy= x + y*s->mb_stride;
2102             const int b_xy = 4*x + 4*y*h->b_stride;
2103             const int b8_xy= 2*x + 2*y*h->b8_stride;
2104
2105             h->mb2b_xy [mb_xy]= b_xy;
2106             h->mb2b8_xy[mb_xy]= b8_xy;
2107         }
2108     }
2109
2110     s->obmc_scratchpad = NULL;
2111
2112     if(!h->dequant4_coeff[0])
2113         init_dequant_tables(h);
2114
2115     return 0;
2116 fail:
2117     free_tables(h);
2118     return -1;
2119 }
2120
2121 /**
2122  * Mimic alloc_tables(), but for every context thread.
2123  */
2124 static void clone_tables(H264Context *dst, H264Context *src){
2125     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2126     dst->non_zero_count           = src->non_zero_count;
2127     dst->slice_table              = src->slice_table;
2128     dst->cbp_table                = src->cbp_table;
2129     dst->mb2b_xy                  = src->mb2b_xy;
2130     dst->mb2b8_xy                 = src->mb2b8_xy;
2131     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2132     dst->mvd_table[0]             = src->mvd_table[0];
2133     dst->mvd_table[1]             = src->mvd_table[1];
2134     dst->direct_table             = src->direct_table;
2135
2136     dst->s.obmc_scratchpad = NULL;
2137     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2138 }
2139
2140 /**
2141  * Init context
2142  * Allocate buffers which are not shared amongst multiple threads.
2143  */
2144 static int context_init(H264Context *h){
2145     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2146     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2147
2148     return 0;
2149 fail:
2150     return -1; // free_tables will clean up for us
2151 }
2152
2153 static av_cold void common_init(H264Context *h){
2154     MpegEncContext * const s = &h->s;
2155
2156     s->width = s->avctx->width;
2157     s->height = s->avctx->height;
2158     s->codec_id= s->avctx->codec->id;
2159
2160     ff_h264_pred_init(&h->hpc, s->codec_id);
2161
2162     h->dequant_coeff_pps= -1;
2163     s->unrestricted_mv=1;
2164     s->decode=1; //FIXME
2165
2166     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2167
2168     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2169     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2170 }
2171
2172 static av_cold int decode_init(AVCodecContext *avctx){
2173     H264Context *h= avctx->priv_data;
2174     MpegEncContext * const s = &h->s;
2175
2176     MPV_decode_defaults(s);
2177
2178     s->avctx = avctx;
2179     common_init(h);
2180
2181     s->out_format = FMT_H264;
2182     s->workaround_bugs= avctx->workaround_bugs;
2183
2184     // set defaults
2185 //    s->decode_mb= ff_h263_decode_mb;
2186     s->quarter_sample = 1;
2187     s->low_delay= 1;
2188
2189     if(avctx->codec_id == CODEC_ID_SVQ3)
2190         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2191     else
2192         avctx->pix_fmt= PIX_FMT_YUV420P;
2193
2194     decode_init_vlc();
2195
2196     if(avctx->extradata_size > 0 && avctx->extradata &&
2197        *(char *)avctx->extradata == 1){
2198         h->is_avc = 1;
2199         h->got_avcC = 0;
2200     } else {
2201         h->is_avc = 0;
2202     }
2203
2204     h->thread_context[0] = h;
2205     h->outputed_poc = INT_MIN;
2206     h->prev_poc_msb= 1<<16;
2207     return 0;
2208 }
2209
2210 static int frame_start(H264Context *h){
2211     MpegEncContext * const s = &h->s;
2212     int i;
2213
2214     if(MPV_frame_start(s, s->avctx) < 0)
2215         return -1;
2216     ff_er_frame_start(s);
2217     /*
2218      * MPV_frame_start uses pict_type to derive key_frame.
2219      * This is incorrect for H.264; IDR markings must be used.
2220      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2221      * See decode_nal_units().
2222      */
2223     s->current_picture_ptr->key_frame= 0;
2224
2225     assert(s->linesize && s->uvlinesize);
2226
2227     for(i=0; i<16; i++){
2228         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2229         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2230     }
2231     for(i=0; i<4; i++){
2232         h->block_offset[16+i]=
2233         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2234         h->block_offset[24+16+i]=
2235         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2236     }
2237
2238     /* can't be in alloc_tables because linesize isn't known there.
2239      * FIXME: redo bipred weight to not require extra buffer? */
2240     for(i = 0; i < s->avctx->thread_count; i++)
2241         if(!h->thread_context[i]->s.obmc_scratchpad)
2242             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2243
2244     /* some macroblocks will be accessed before they're available */
2245     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2246         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2247
2248 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2249
2250     // We mark the current picture as non-reference after allocating it, so
2251     // that if we break out due to an error it can be released automatically
2252     // in the next MPV_frame_start().
2253     // SVQ3 as well as most other codecs have only last/next/current and thus
2254     // get released even with set reference, besides SVQ3 and others do not
2255     // mark frames as reference later "naturally".
2256     if(s->codec_id != CODEC_ID_SVQ3)
2257         s->current_picture_ptr->reference= 0;
2258
2259     s->current_picture_ptr->field_poc[0]=
2260     s->current_picture_ptr->field_poc[1]= INT_MAX;
2261     assert(s->current_picture_ptr->long_ref==0);
2262
2263     return 0;
2264 }
2265
/**
 * Saves the right-most column and the bottom row of the current macroblock
 * into h->left_border and h->top_borders[...][s->mb_x].
 * hl_decode_mb_internal() calls this right before the loop filter runs on
 * the macroblock; xchg_mb_border() reads the saved samples back.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line steps used for this macroblock
 * @param simple when nonzero the MBAFF handling is skipped and chroma is
 *               always saved
 */
2266 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2267     MpegEncContext * const s = &h->s;
2268     int i;
2269     int step    = 1;
2270     int offset  = 1;
2271     int uvoffset= 1;
2272     int top_idx = 1;
2273     int skiplast= 0;
2274
         // point at the line above the macroblock, so that line i (1-based)
         // of the macroblock is at i*linesize
2275     src_y  -=   linesize;
2276     src_cb -= uvlinesize;
2277     src_cr -= uvlinesize;
2278
2279     if(!simple && FRAME_MBAFF){
2280         if(s->mb_y&1){
2281             offset  = MB_MBAFF ? 1 : 17;
2282             uvoffset= MB_MBAFF ? 1 : 9;
2283             if(!MB_MBAFF){
2284                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2285                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2286                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2287                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2288                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2289                 }
2290             }
2291         }else{
2292             if(!MB_MBAFF){
2293                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2294                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2295                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2296                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2297                 }
2298                 skiplast= 1;
2299             }
2300             offset  =
2301             uvoffset=
2302             top_idx = MB_MBAFF ? 0 : 1;
2303         }
2304         step= MB_MBAFF ? 2 : 1;
2305     }
2306
2307     // There are two lines saved, the line above the top macroblock of a pair,
2308     // and the line above the bottom macroblock
         // the corner sample must be taken from top_borders before that row
         // is overwritten further down
2309     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2310     for(i=1; i<17 - skiplast; i++){
2311         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2312     }
2313
         // bottom luma row of the macroblock, 2x8 bytes
2314     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2315     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2316
2317     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
             // chroma: cb is kept at left_border[34..], cr at left_border[34+18..],
             // and at byte offsets 16 / 24 of the top border row
2318         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2319         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2320         for(i=1; i<9 - skiplast; i++){
2321             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2322             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2323         }
2324         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2325         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2326     }
2327 }
2328
/**
 * Exchanges the samples bordering the current macroblock (left column, top
 * row and part of the top-right row) between the picture and the copies
 * kept in h->left_border / h->top_borders.
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction
 * and with xchg=0 afterwards, when the loop filter is enabled.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line steps used for this macroblock
 * @param xchg 1: really swap; 0: only copy the saved samples into the
 *             picture (some positions are always swapped, see the literal 1
 *             arguments below)
 * @param simple when nonzero the MBAFF handling is skipped and chroma is
 *               always processed
 */
2329 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2330     MpegEncContext * const s = &h->s;
2331     int temp8, i;
2332     uint64_t temp64;
2333     int deblock_left;
2334     int deblock_top;
2335     int mb_xy;
2336     int step    = 1;
2337     int offset  = 1;
2338     int uvoffset= 1;
2339     int top_idx = 1;
2340
         // same offset / step / top_idx selection as in backup_mb_border()
2341     if(!simple && FRAME_MBAFF){
2342         if(s->mb_y&1){
2343             offset  = MB_MBAFF ? 1 : 17;
2344             uvoffset= MB_MBAFF ? 1 : 9;
2345         }else{
2346             offset  =
2347             uvoffset=
2348             top_idx = MB_MBAFF ? 0 : 1;
2349         }
2350         step= MB_MBAFF ? 2 : 1;
2351     }
2352
         // with deblocking_filter == 2 a border only counts if the neighbour
         // has the same slice_table entry, otherwise picture position decides
2353     if(h->deblocking_filter == 2) {
2354         mb_xy = h->mb_xy;
2355         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2356         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2357     } else {
2358         deblock_left = (s->mb_x > 0);
2359         deblock_top =  (s->mb_y > !!MB_FIELD);
2360     }
2361
         // move to the sample diagonally above-left of the macroblock
2362     src_y  -=   linesize + 1;
2363     src_cb -= uvlinesize + 1;
2364     src_cr -= uvlinesize + 1;
2365
     // XCHG: b always receives the old value of a; a receives b only if xchg is set
2366 #define XCHG(a,b,t,xchg)\
2367 t= a;\
2368 if(xchg)\
2369     a= b;\
2370 b= t;
2371
2372     if(deblock_left){
             // i==0 is the corner sample, it is skipped when there is no top border
2373         for(i = !deblock_top; i<16; i++){
2374             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2375         }
             // i==16 here: the last line is swapped regardless of xchg
2376         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2377     }
2378
2379     if(deblock_top){
2380         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2381         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
             // first 8 luma samples above the macroblock to the right
2382         if(s->mb_x+1 < s->mb_width){
2383             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2384         }
2385     }
2386
2387     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2388         if(deblock_left){
2389             for(i = !deblock_top; i<8; i++){
2390                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2391                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2392             }
2393             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2394             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2395         }
2396         if(deblock_top){
2397             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2398             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2399         }
2400     }
2401 }
2402
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture and, if h->deblocking_filter is set, runs the loop
 * filter on it.
 *
 * In order: the destination pointers are computed; field macroblocks get
 * doubled line steps; I_PCM samples are copied straight from h->mb; intra
 * macroblocks are predicted through h->hpc (with the border samples swapped
 * in and out through xchg_mb_border() when deblocking is enabled); other
 * macroblocks are motion compensated by hl_motion() (H.264 only); the luma
 * and chroma residuals are added; h->mb is cleared; finally the borders are
 * backed up and the macroblock is deblocked.
 *
 * @param simple The function is always inlined and its two callers pass the
 *               constants 1 and 0. With simple set, every condition written
 *               as "!simple && ..." is false and every "simple || ..." is
 *               true, i.e. the field/MBAFF, I_PCM, transform bypass, gray
 *               and non-H.264 branches are not taken.
 */
2403 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2404     MpegEncContext * const s = &h->s;
2405     const int mb_x= s->mb_x;
2406     const int mb_y= s->mb_y;
2407     const int mb_xy= h->mb_xy;
2408     const int mb_type= s->current_picture.mb_type[mb_xy];
2409     uint8_t  *dest_y, *dest_cb, *dest_cr;
2410     int linesize, uvlinesize /*dct_offset*/;
2411     int i;
2412     int *block_offset = &h->block_offset[0];
2413     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2414     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2415     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2416     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2417
         // top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma)
2418     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2419     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2420     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2421
2422     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2423     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2424
2425     if (!simple && MB_FIELD) {
             // field macroblock: doubled line steps and the block offsets
             // stored from index 24 on
2426         linesize   = h->mb_linesize   = s->linesize * 2;
2427         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2428         block_offset = &h->block_offset[24];
             // odd macroblock row: move up by 15 luma / 7 chroma lines, i.e.
             // to the second line of the macroblock above
2429         if(mb_y&1){ //FIXME move out of this function?
2430             dest_y -= s->linesize*15;
2431             dest_cb-= s->uvlinesize*7;
2432             dest_cr-= s->uvlinesize*7;
2433         }
2434         if(FRAME_MBAFF) {
2435             int list;
                 // rewrite the cached reference indices as (16+ref)^(mb_y&1)
2436             for(list=0; list<h->list_count; list++){
2437                 if(!USES_LIST(mb_type, list))
2438                     continue;
2439                 if(IS_16X16(mb_type)){
2440                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2441                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2442                 }else{
2443                     for(i=0; i<16; i+=4){
2444                         int ref = h->ref_cache[list][scan8[i]];
2445                         if(ref >= 0)
2446                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2447                     }
2448                 }
2449             }
2450         }
2451     } else {
2452         linesize   = h->mb_linesize   = s->linesize;
2453         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2454 //        dct_offset = s->linesize * 16;
2455     }
2456
2457     if (!simple && IS_INTRA_PCM(mb_type)) {
             // I_PCM: the raw samples are copied from h->mb, no prediction or transform
2458         for (i=0; i<16; i++) {
2459             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2460         }
2461         for (i=0; i<8; i++) {
2462             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2463             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2464         }
2465     } else {
2466         if(IS_INTRA(mb_type)){
                 // swap the saved border samples into the picture for the
                 // duration of the intra prediction (undone with xchg=0 below)
2467             if(h->deblocking_filter)
2468                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2469
2470             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2471                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2472                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2473             }
2474
2475             if(IS_INTRA4x4(mb_type)){
2476                 if(simple || !s->encoding){
2477                     if(IS_8x8DCT(mb_type)){
2478                         if(transform_bypass){
2479                             idct_dc_add =
2480                             idct_add    = s->dsp.add_pixels8;
2481                         }else{
2482                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2483                             idct_add    = s->dsp.h264_idct8_add;
2484                         }
                             // four 8x8 luma blocks: predict, then add the residual
2485                         for(i=0; i<16; i+=4){
2486                             uint8_t * const ptr= dest_y + block_offset[i];
2487                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2488                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2489                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2490                             }else{
2491                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2492                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2493                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2494                                 if(nnz){
2495                                     if(nnz == 1 && h->mb[i*16])
2496                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2497                                     else
2498                                         idct_add   (ptr, h->mb + i*16, linesize);
2499                                 }
2500                             }
2501                         }
2502                     }else{
2503                         if(transform_bypass){
2504                             idct_dc_add =
2505                             idct_add    = s->dsp.add_pixels4;
2506                         }else{
2507                             idct_dc_add = s->dsp.h264_idct_dc_add;
2508                             idct_add    = s->dsp.h264_idct_add;
2509                         }
                             // sixteen 4x4 luma blocks: predict, then add the residual
2510                         for(i=0; i<16; i++){
2511                             uint8_t * const ptr= dest_y + block_offset[i];
2512                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2513
2514                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2515                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2516                             }else{
2517                                 uint8_t *topright;
2518                                 int nnz, tr;
2519                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2520                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2521                                     assert(mb_y || linesize <= block_offset[i]);
                                         // no top-right samples: replicate the last
                                         // sample of the row above four times
2522                                     if(!topright_avail){
2523                                         tr= ptr[3 - linesize]*0x01010101;
2524                                         topright= (uint8_t*) &tr;
2525                                     }else
2526                                         topright= ptr + 4 - linesize;
2527                                 }else
2528                                     topright= NULL;
2529
2530                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2531                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2532                                 if(nnz){
2533                                     if(is_h264){
2534                                         if(nnz == 1 && h->mb[i*16])
2535                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2536                                         else
2537                                             idct_add   (ptr, h->mb + i*16, linesize);
2538                                     }else
2539                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2540                                 }
2541                             }
2542                         }
2543                     }
2544                 }
2545             }else{
                     // not intra 4x4: predict the whole 16x16 luma block; the
                     // residual is added in the !IS_INTRA4x4 section further down
2546                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2547                 if(is_h264){
2548                     if(!transform_bypass)
2549                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2550                 }else
2551                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2552             }
2553             if(h->deblocking_filter)
2554                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2555         }else if(is_h264){
                 // non-intra macroblock: motion compensation
2556             hl_motion(h, dest_y, dest_cb, dest_cr,
2557                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2558                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2559                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2560         }
2561
2562
             // luma residual for everything except intra 4x4 (handled above)
2563         if(!IS_INTRA4x4(mb_type)){
2564             if(is_h264){
2565                 if(IS_INTRA16x16(mb_type)){
2566                     if(transform_bypass){
2567                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2568                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2569                         }else{
2570                             for(i=0; i<16; i++){
2571                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2572                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2573                             }
2574                         }
2575                     }else{
2576                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2577                     }
2578                 }else if(h->cbp&15){
2579                     if(transform_bypass){
2580                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2581                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2582                         for(i=0; i<16; i+=di){
2583                             if(h->non_zero_count_cache[ scan8[i] ]){
2584                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2585                             }
2586                         }
2587                     }else{
2588                         if(IS_8x8DCT(mb_type)){
2589                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2590                         }else{
2591                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2592                         }
2593                     }
2594                 }
2595             }else{
2596                 for(i=0; i<16; i++){
2597                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2598                         uint8_t * const ptr= dest_y + block_offset[i];
2599                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2600                     }
2601                 }
2602             }
2603         }
2604
             // chroma residual, only if one of the cbp bits in 0x30 is set;
             // blocks 16..19 go to dest[0] (cb), 20..23 to dest[1] (cr)
2605         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2606             uint8_t *dest[2] = {dest_cb, dest_cr};
2607             if(transform_bypass){
2608                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2609                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2610                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2611                 }else{
2612                     idct_add = s->dsp.add_pixels4;
2613                     for(i=16; i<16+8; i++){
2614                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2615                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2616                     }
2617                 }
2618             }else{
2619                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2620                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2621                 if(is_h264){
2622                     idct_add = s->dsp.h264_idct_add;
2623                     idct_dc_add = s->dsp.h264_idct_dc_add;
2624                     for(i=16; i<16+8; i++){
2625                         if(h->non_zero_count_cache[ scan8[i] ])
2626                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2627                         else if(h->mb[i*16])
2628                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2629                     }
2630                 }else{
2631                     for(i=16; i<16+8; i++){
2632                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2633                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2634                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2635                         }
2636                     }
2637                 }
2638             }
2639         }
2640     }
         // the coefficient buffer is reset whenever it may have been written
2641     if(h->cbp || IS_INTRA(mb_type))
2642         s->dsp.clear_blocks(h->mb);
2643
2644     if(h->deblocking_filter) {
             // the border samples are saved before the filter modifies them
2645         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2646         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2647         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2648         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2649         if (!simple && FRAME_MBAFF) {
2650             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2651         } else {
2652             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2653         }
2654     }
2655 }
2656
2657 /**
2658  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2659  */
2660 static void hl_decode_mb_simple(H264Context *h){
2661     hl_decode_mb_internal(h, 1);
2662 }
2663
2664 /**
2665  * Process a macroblock; this handles edge cases, such as interlacing.
2666  */
2667 static void av_noinline hl_decode_mb_complex(H264Context *h){
2668     hl_decode_mb_internal(h, 0);
2669 }
2670
2671 static void hl_decode_mb(H264Context *h){
2672     MpegEncContext * const s = &h->s;
2673     const int mb_xy= h->mb_xy;
2674     const int mb_type= s->current_picture.mb_type[mb_xy];
2675     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2676
2677     if(ENABLE_H264_ENCODER && !s->decode)
2678         return;
2679
2680     if (is_complex)
2681         hl_decode_mb_complex(h);
2682     else hl_decode_mb_simple(h);
2683 }
2684
2685 static void pic_as_field(Picture *pic, const int parity){
2686     int i;
2687     for (i = 0; i < 4; ++i) {
2688         if (parity == PICT_BOTTOM_FIELD)
2689             pic->data[i] += pic->linesize[i];
2690         pic->reference = parity;
2691         pic->linesize[i] *= 2;
2692     }
2693     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2694 }
2695
2696 static int split_field_copy(Picture *dest, Picture *src,
2697                             int parity, int id_add){
2698     int match = !!(src->reference & parity);
2699
2700     if (match) {
2701         *dest = *src;
2702         if(parity != PICT_FRAME){
2703             pic_as_field(dest, parity);
2704             dest->pic_id *= 2;
2705             dest->pic_id += id_add;
2706         }
2707     }
2708
2709     return match;
2710 }
2711
2712 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2713     int i[2]={0};
2714     int index=0;
2715
2716     while(i[0]<len || i[1]<len){
2717         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2718             i[0]++;
2719         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2720             i[1]++;
2721         if(i[0] < len){
2722             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2723             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2724         }
2725         if(i[1] < len){
2726             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2727             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2728         }
2729     }
2730
2731     return index;
2732 }
2733
2734 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2735     int i, best_poc;
2736     int out_i= 0;
2737
2738     for(;;){
2739         best_poc= dir ? INT_MIN : INT_MAX;
2740
2741         for(i=0; i<len; i++){
2742             const int poc= src[i]->poc;
2743             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2744                 best_poc= poc;
2745                 sorted[out_i]= src[i];
2746             }
2747         }
2748         if(best_poc == (dir ? INT_MIN : INT_MAX))
2749             break;
2750         limit= sorted[out_i++]->poc - dir;
2751     }
2752     return out_i;
2753 }
2754
2755 /**
2756  * fills the default_ref_list.
2757  */
2758 static int fill_default_ref_list(H264Context *h){
2759     MpegEncContext * const s = &h->s;
2760     int i, len;
2761
2762     if(h->slice_type_nos==FF_B_TYPE){
2763         Picture *sorted[32];
2764         int cur_poc, list;
2765         int lens[2];
2766
2767         if(FIELD_PICTURE)
2768             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2769         else
2770             cur_poc= s->current_picture_ptr->poc;
2771
2772         for(list= 0; list<2; list++){
2773             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2774             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2775             assert(len<=32);
2776             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2777             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2778             assert(len<=32);
2779
2780             if(len < h->ref_count[list])
2781                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2782             lens[list]= len;
2783         }
2784
2785         if(lens[0] == lens[1] && lens[1] > 1){
2786             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2787             if(i == lens[0])
2788                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2789         }
2790     }else{
2791         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2792         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2793         assert(len <= 32);
2794         if(len < h->ref_count[0])
2795             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2796     }
2797 #ifdef TRACE
2798     for (i=0; i<h->ref_count[0]; i++) {
2799         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2800     }
2801     if(h->slice_type_nos==FF_B_TYPE){
2802         for (i=0; i<h->ref_count[1]; i++) {
2803             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2804         }
2805     }
2806 #endif
2807     return 0;
2808 }
2809
2810 static void print_short_term(H264Context *h);
2811 static void print_long_term(H264Context *h);
2812
2813 /**
2814  * Extract structure information about the picture described by pic_num in
2815  * the current decoding context (frame or field). Note that pic_num is
2816  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2817  * @param pic_num picture number for which to extract structure information
2818  * @param structure one of PICT_XXX describing structure of picture
2819  *                      with pic_num
2820  * @return frame number (short term) or long term index of picture
2821  *         described by pic_num
2822  */
2823 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2824     MpegEncContext * const s = &h->s;
2825
2826     *structure = s->picture_structure;
2827     if(FIELD_PICTURE){
2828         if (!(pic_num & 1))
2829             /* opposite field */
2830             *structure ^= PICT_FRAME;
2831         pic_num >>= 1;
2832     }
2833
2834     return pic_num;
2835 }
2836
2837 static int decode_ref_pic_list_reordering(H264Context *h){
2838     MpegEncContext * const s = &h->s;
2839     int list, index, pic_structure;
2840
2841     print_short_term(h);
2842     print_long_term(h);
2843
2844     for(list=0; list<h->list_count; list++){
2845         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2846
2847         if(get_bits1(&s->gb)){
2848             int pred= h->curr_pic_num;
2849
2850             for(index=0; ; index++){
2851                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2852                 unsigned int pic_id;
2853                 int i;
2854                 Picture *ref = NULL;
2855
2856                 if(reordering_of_pic_nums_idc==3)
2857                     break;
2858
2859                 if(index >= h->ref_count[list]){
2860                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2861                     return -1;
2862                 }
2863
2864                 if(reordering_of_pic_nums_idc<3){
2865                     if(reordering_of_pic_nums_idc<2){
2866                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2867                         int frame_num;
2868
2869                         if(abs_diff_pic_num > h->max_pic_num){
2870                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2871                             return -1;
2872                         }
2873
2874                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2875                         else                                pred+= abs_diff_pic_num;
2876                         pred &= h->max_pic_num - 1;
2877
2878                         frame_num = pic_num_extract(h, pred, &pic_structure);
2879
2880                         for(i= h->short_ref_count-1; i>=0; i--){
2881                             ref = h->short_ref[i];
2882                             assert(ref->reference);
2883                             assert(!ref->long_ref);
2884                             if(
2885                                    ref->frame_num == frame_num &&
2886                                    (ref->reference & pic_structure)
2887                               )
2888                                 break;
2889                         }
2890                         if(i>=0)
2891                             ref->pic_id= pred;
2892                     }else{
2893                         int long_idx;
2894                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2895
2896                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2897
2898                         if(long_idx>31){
2899                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2900                             return -1;
2901                         }
2902                         ref = h->long_ref[long_idx];
2903                         assert(!(ref && !ref->reference));
2904                         if(ref && (ref->reference & pic_structure)){
2905                             ref->pic_id= pic_id;
2906                             assert(ref->long_ref);
2907                             i=0;
2908                         }else{
2909                             i=-1;
2910                         }
2911                     }
2912
2913                     if (i < 0) {
2914                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2915                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2916                     } else {
2917                         for(i=index; i+1<h->ref_count[list]; i++){
2918                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2919                                 break;
2920                         }
2921                         for(; i > index; i--){
2922                             h->ref_list[list][i]= h->ref_list[list][i-1];
2923                         }
2924                         h->ref_list[list][index]= *ref;
2925                         if (FIELD_PICTURE){
2926                             pic_as_field(&h->ref_list[list][index], pic_structure);
2927                         }
2928                     }
2929                 }else{
2930                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2931                     return -1;
2932                 }
2933             }
2934         }
2935     }
2936     for(list=0; list<h->list_count; list++){
2937         for(index= 0; index < h->ref_count[list]; index++){
2938             if(!h->ref_list[list][index].data[0]){
2939                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2940                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2941             }
2942         }
2943     }
2944
2945     return 0;
2946 }
2947
2948 static void fill_mbaff_ref_list(H264Context *h){
2949     int list, i, j;
2950     for(list=0; list<2; list++){ //FIXME try list_count
2951         for(i=0; i<h->ref_count[list]; i++){
2952             Picture *frame = &h->ref_list[list][i];
2953             Picture *field = &h->ref_list[list][16+2*i];
2954             field[0] = *frame;
2955             for(j=0; j<3; j++)
2956                 field[0].linesize[j] <<= 1;
2957             field[0].reference = PICT_TOP_FIELD;
2958             field[0].poc= field[0].field_poc[0];
2959             field[1] = field[0];
2960             for(j=0; j<3; j++)
2961                 field[1].data[j] += frame->linesize[j];
2962             field[1].reference = PICT_BOTTOM_FIELD;
2963             field[1].poc= field[1].field_poc[1];
2964
2965             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2966             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2967             for(j=0; j<2; j++){
2968                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2969                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2970             }
2971         }
2972     }
2973     for(j=0; j<h->ref_count[1]; j++){
2974         for(i=0; i<h->ref_count[0]; i++)
2975             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2976         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2977         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2978     }
2979 }
2980
2981 static int pred_weight_table(H264Context *h){
2982     MpegEncContext * const s = &h->s;
2983     int list, i;
2984     int luma_def, chroma_def;
2985
2986     h->use_weight= 0;
2987     h->use_weight_chroma= 0;
2988     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2989     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2990     luma_def = 1<<h->luma_log2_weight_denom;
2991     chroma_def = 1<<h->chroma_log2_weight_denom;
2992
2993     for(list=0; list<2; list++){
2994         for(i=0; i<h->ref_count[list]; i++){
2995             int luma_weight_flag, chroma_weight_flag;
2996
2997             luma_weight_flag= get_bits1(&s->gb);
2998             if(luma_weight_flag){
2999                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3000                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3001                 if(   h->luma_weight[list][i] != luma_def
3002                    || h->luma_offset[list][i] != 0)
3003                     h->use_weight= 1;
3004             }else{
3005                 h->luma_weight[list][i]= luma_def;
3006                 h->luma_offset[list][i]= 0;
3007             }
3008
3009             if(CHROMA){
3010                 chroma_weight_flag= get_bits1(&s->gb);
3011                 if(chroma_weight_flag){
3012                     int j;
3013                     for(j=0; j<2; j++){
3014                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3015                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3016                         if(   h->chroma_weight[list][i][j] != chroma_def
3017                         || h->chroma_offset[list][i][j] != 0)
3018                             h->use_weight_chroma= 1;
3019                     }
3020                 }else{
3021                     int j;
3022                     for(j=0; j<2; j++){
3023                         h->chroma_weight[list][i][j]= chroma_def;
3024                         h->chroma_offset[list][i][j]= 0;
3025                     }
3026                 }
3027             }
3028         }
3029         if(h->slice_type_nos != FF_B_TYPE) break;
3030     }
3031     h->use_weight= h->use_weight || h->use_weight_chroma;
3032     return 0;
3033 }
3034
3035 static void implicit_weight_table(H264Context *h){
3036     MpegEncContext * const s = &h->s;
3037     int ref0, ref1;
3038     int cur_poc = s->current_picture_ptr->poc;
3039
3040     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3041        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3042         h->use_weight= 0;
3043         h->use_weight_chroma= 0;
3044         return;
3045     }
3046
3047     h->use_weight= 2;
3048     h->use_weight_chroma= 2;
3049     h->luma_log2_weight_denom= 5;
3050     h->chroma_log2_weight_denom= 5;
3051
3052     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3053         int poc0 = h->ref_list[0][ref0].poc;
3054         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3055             int poc1 = h->ref_list[1][ref1].poc;
3056             int td = av_clip(poc1 - poc0, -128, 127);
3057             if(td){
3058                 int tb = av_clip(cur_poc - poc0, -128, 127);
3059                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3060                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3061                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3062                     h->implicit_weight[ref0][ref1] = 32;
3063                 else
3064                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3065             }else
3066                 h->implicit_weight[ref0][ref1] = 32;
3067         }
3068     }
3069 }
3070
3071 /**
3072  * Mark a picture as no longer needed for reference. The refmask
3073  * argument allows unreferencing of individual fields or the whole frame.
3074  * If the picture becomes entirely unreferenced, but is being held for
3075  * display purposes, it is marked as such.
3076  * @param refmask mask of fields to unreference; the mask is bitwise
3077  *                anded with the reference marking of pic
3078  * @return non-zero if pic becomes entirely unreferenced (except possibly
3079  *         for display purposes) zero if one of the fields remains in
3080  *         reference
3081  */
3082 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3083     int i;
3084     if (pic->reference &= refmask) {
3085         return 0;
3086     } else {
3087         for(i = 0; h->delayed_pic[i]; i++)
3088             if(pic == h->delayed_pic[i]){
3089                 pic->reference=DELAYED_PIC_REF;
3090                 break;
3091             }
3092         return 1;
3093     }
3094 }
3095
3096 /**
3097  * instantaneous decoder refresh.
3098  */
3099 static void idr(H264Context *h){
3100     int i;
3101
3102     for(i=0; i<16; i++){
3103         remove_long(h, i, 0);
3104     }
3105     assert(h->long_ref_count==0);
3106
3107     for(i=0; i<h->short_ref_count; i++){
3108         unreference_pic(h, h->short_ref[i], 0);
3109         h->short_ref[i]= NULL;
3110     }
3111     h->short_ref_count=0;
3112     h->prev_frame_num= 0;
3113     h->prev_frame_num_offset= 0;
3114     h->prev_poc_msb=
3115     h->prev_poc_lsb= 0;
3116 }
3117
3118 /* forget old pics after a seek */
3119 static void flush_dpb(AVCodecContext *avctx){
3120     H264Context *h= avctx->priv_data;
3121     int i;
3122     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3123         if(h->delayed_pic[i])
3124             h->delayed_pic[i]->reference= 0;
3125         h->delayed_pic[i]= NULL;
3126     }
3127     h->outputed_poc= INT_MIN;
3128     idr(h);
3129     if(h->s.current_picture_ptr)
3130         h->s.current_picture_ptr->reference= 0;
3131     h->s.first_field= 0;
3132     ff_mpeg_flush(avctx);
3133 }
3134
3135 /**
3136  * Find a Picture in the short term reference list by frame number.
3137  * @param frame_num frame number to search for
3138  * @param idx the index into h->short_ref where returned picture is found
3139  *            undefined if no picture found.
3140  * @return pointer to the found picture, or NULL if no pic with the provided
3141  *                 frame number is found
3142  */
3143 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3144     MpegEncContext * const s = &h->s;
3145     int i;
3146
3147     for(i=0; i<h->short_ref_count; i++){
3148         Picture *pic= h->short_ref[i];
3149         if(s->avctx->debug&FF_DEBUG_MMCO)
3150             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3151         if(pic->frame_num == frame_num) {
3152             *idx = i;
3153             return pic;
3154         }
3155     }
3156     return NULL;
3157 }
3158
3159 /**
3160  * Remove a picture from the short term reference list by its index in
3161  * that list.  This does no checking on the provided index; it is assumed
3162  * to be valid. Other list entries are shifted down.
3163  * @param i index into h->short_ref of picture to remove.
3164  */
3165 static void remove_short_at_index(H264Context *h, int i){
3166     assert(i >= 0 && i < h->short_ref_count);
3167     h->short_ref[i]= NULL;
3168     if (--h->short_ref_count)
3169         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3170 }
3171
3172 /**
3173  *
3174  * @return the removed picture or NULL if an error occurs
3175  */
3176 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3177     MpegEncContext * const s = &h->s;
3178     Picture *pic;
3179     int i;
3180
3181     if(s->avctx->debug&FF_DEBUG_MMCO)
3182         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3183
3184     pic = find_short(h, frame_num, &i);
3185     if (pic){
3186         if(unreference_pic(h, pic, ref_mask))
3187         remove_short_at_index(h, i);
3188     }
3189
3190     return pic;
3191 }
3192
3193 /**
3194  * Remove a picture from the long term reference list by its index in
3195  * that list.
3196  * @return the removed picture or NULL if an error occurs
3197  */
3198 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3199     Picture *pic;
3200
3201     pic= h->long_ref[i];
3202     if (pic){
3203         if(unreference_pic(h, pic, ref_mask)){
3204             assert(h->long_ref[i]->long_ref == 1);
3205             h->long_ref[i]->long_ref= 0;
3206             h->long_ref[i]= NULL;
3207             h->long_ref_count--;
3208         }
3209     }
3210
3211     return pic;
3212 }
3213
3214 /**
3215  * print short term list
3216  */
3217 static void print_short_term(H264Context *h) {
3218     uint32_t i;
3219     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3220         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3221         for(i=0; i<h->short_ref_count; i++){
3222             Picture *pic= h->short_ref[i];
3223             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3224         }
3225     }
3226 }
3227
3228 /**
3229  * print long term list
3230  */
3231 static void print_long_term(H264Context *h) {
3232     uint32_t i;
3233     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3234         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3235         for(i = 0; i < 16; i++){
3236             Picture *pic= h->long_ref[i];
3237             if (pic) {
3238                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3239             }
3240         }
3241     }
3242 }
3243
3244 /**
3245  * Executes the reference picture marking (memory management control operations).
3246  */
3247 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3248     MpegEncContext * const s = &h->s;
3249     int i, j;
3250     int current_ref_assigned=0;
3251     Picture *pic;
3252
3253     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3254         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3255
3256     for(i=0; i<mmco_count; i++){
3257         int structure, frame_num;
3258         if(s->avctx->debug&FF_DEBUG_MMCO)
3259             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3260
3261         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3262            || mmco[i].opcode == MMCO_SHORT2LONG){
3263             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3264             pic = find_short(h, frame_num, &j);
3265             if(!pic){
3266                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3267                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3268                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3269                 continue;
3270             }
3271         }
3272
3273         switch(mmco[i].opcode){
3274         case MMCO_SHORT2UNUSED:
3275             if(s->avctx->debug&FF_DEBUG_MMCO)
3276                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3277             remove_short(h, frame_num, structure ^ PICT_FRAME);
3278             break;
3279         case MMCO_SHORT2LONG:
3280                 if (h->long_ref[mmco[i].long_arg] != pic)
3281                     remove_long(h, mmco[i].long_arg, 0);
3282
3283                 remove_short_at_index(h, j);
3284                 h->long_ref[ mmco[i].long_arg ]= pic;
3285                 if (h->long_ref[ mmco[i].long_arg ]){
3286                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3287                     h->long_ref_count++;
3288                 }
3289             break;
3290         case MMCO_LONG2UNUSED:
3291             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3292             pic = h->long_ref[j];
3293             if (pic) {
3294                 remove_long(h, j, structure ^ PICT_FRAME);
3295             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3296                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3297             break;
3298         case MMCO_LONG:
3299                     // Comment below left from previous code as it is an interresting note.
3300                     /* First field in pair is in short term list or
3301                      * at a different long term index.
3302                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3303                      * Report the problem and keep the pair where it is,
3304                      * and mark this field valid.
3305                      */
3306
3307             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3308                 remove_long(h, mmco[i].long_arg, 0);
3309
3310                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3311                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3312                 h->long_ref_count++;
3313             }
3314
3315             s->current_picture_ptr->reference |= s->picture_structure;
3316             current_ref_assigned=1;
3317             break;
3318         case MMCO_SET_MAX_LONG:
3319             assert(mmco[i].long_arg <= 16);
3320             // just remove the long term which index is greater than new max
3321             for(j = mmco[i].long_arg; j<16; j++){
3322                 remove_long(h, j, 0);
3323             }
3324             break;
3325         case MMCO_RESET:
3326             while(h->short_ref_count){
3327                 remove_short(h, h->short_ref[0]->frame_num, 0);
3328             }
3329             for(j = 0; j < 16; j++) {
3330                 remove_long(h, j, 0);
3331             }
3332             s->current_picture_ptr->poc=
3333             s->current_picture_ptr->field_poc[0]=
3334             s->current_picture_ptr->field_poc[1]=
3335             h->poc_lsb=
3336             h->poc_msb=
3337             h->frame_num=
3338             s->current_picture_ptr->frame_num= 0;
3339             break;
3340         default: assert(0);
3341         }
3342     }
3343
3344     if (!current_ref_assigned) {
3345         /* Second field of complementary field pair; the first field of
3346          * which is already referenced. If short referenced, it
3347          * should be first entry in short_ref. If not, it must exist
3348          * in long_ref; trying to put it on the short list here is an
3349          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3350          */
3351         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3352             /* Just mark the second field valid */
3353             s->current_picture_ptr->reference = PICT_FRAME;
3354         } else if (s->current_picture_ptr->long_ref) {
3355             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3356                                              "assignment for second field "
3357                                              "in complementary field pair "
3358                                              "(first field is long term)\n");
3359         } else {
3360             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3361             if(pic){
3362                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3363             }
3364
3365             if(h->short_ref_count)
3366                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3367
3368             h->short_ref[0]= s->current_picture_ptr;
3369             h->short_ref_count++;
3370             s->current_picture_ptr->reference |= s->picture_structure;
3371         }
3372     }
3373
3374     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3375
3376         /* We have too many reference frames, probably due to corrupted
3377          * stream. Need to discard one frame. Prevents overrun of the
3378          * short_ref and long_ref buffers.
3379          */
3380         av_log(h->s.avctx, AV_LOG_ERROR,
3381                "number of reference frames exceeds max (probably "
3382                "corrupt input), discarding one\n");
3383
3384         if (h->long_ref_count && !h->short_ref_count) {
3385             for (i = 0; i < 16; ++i)
3386                 if (h->long_ref[i])
3387                     break;
3388
3389             assert(i < 16);
3390             remove_long(h, i, 0);
3391         } else {
3392             pic = h->short_ref[h->short_ref_count - 1];
3393             remove_short(h, pic->frame_num, 0);
3394         }
3395     }
3396
3397     print_short_term(h);
3398     print_long_term(h);
3399     return 0;
3400 }
3401
3402 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3403     MpegEncContext * const s = &h->s;
3404     int i;
3405
3406     h->mmco_index= 0;
3407     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3408         s->broken_link= get_bits1(gb) -1;
3409         if(get_bits1(gb)){
3410             h->mmco[0].opcode= MMCO_LONG;
3411             h->mmco[0].long_arg= 0;
3412             h->mmco_index= 1;
3413         }
3414     }else{
3415         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3416             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3417                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3418
3419                 h->mmco[i].opcode= opcode;
3420                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3421                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3422 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3423                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3424                         return -1;
3425                     }*/
3426                 }
3427                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3428                     unsigned int long_arg= get_ue_golomb_31(gb);
3429                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3430                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3431                         return -1;
3432                     }
3433                     h->mmco[i].long_arg= long_arg;
3434                 }
3435
3436                 if(opcode > (unsigned)MMCO_LONG){
3437                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3438                     return -1;
3439                 }
3440                 if(opcode == MMCO_END)
3441                     break;
3442             }
3443             h->mmco_index= i;
3444         }else{
3445             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3446
3447             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3448                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3449                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3450                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3451                 h->mmco_index= 1;
3452                 if (FIELD_PICTURE) {
3453                     h->mmco[0].short_pic_num *= 2;
3454                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3455                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3456                     h->mmco_index= 2;
3457                 }
3458             }
3459         }
3460     }
3461
3462     return 0;
3463 }
3464
3465 static int init_poc(H264Context *h){
3466     MpegEncContext * const s = &h->s;
3467     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3468     int field_poc[2];
3469     Picture *cur = s->current_picture_ptr;
3470
3471     h->frame_num_offset= h->prev_frame_num_offset;
3472     if(h->frame_num < h->prev_frame_num)
3473         h->frame_num_offset += max_frame_num;
3474
3475     if(h->sps.poc_type==0){
3476         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3477
3478         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3479             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3480         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3481             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3482         else
3483             h->poc_msb = h->prev_poc_msb;
3484 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3485         field_poc[0] =
3486         field_poc[1] = h->poc_msb + h->poc_lsb;
3487         if(s->picture_structure == PICT_FRAME)
3488             field_poc[1] += h->delta_poc_bottom;
3489     }else if(h->sps.poc_type==1){
3490         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3491         int i;
3492
3493         if(h->sps.poc_cycle_length != 0)
3494             abs_frame_num = h->frame_num_offset + h->frame_num;
3495         else
3496             abs_frame_num = 0;
3497
3498         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3499             abs_frame_num--;
3500
3501         expected_delta_per_poc_cycle = 0;
3502         for(i=0; i < h->sps.poc_cycle_length; i++)
3503             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3504
3505         if(abs_frame_num > 0){
3506             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3507             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3508
3509             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3510             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3511                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3512         } else
3513             expectedpoc = 0;
3514
3515         if(h->nal_ref_idc == 0)
3516             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3517
3518         field_poc[0] = expectedpoc + h->delta_poc[0];
3519         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3520
3521         if(s->picture_structure == PICT_FRAME)
3522             field_poc[1] += h->delta_poc[1];
3523     }else{
3524         int poc= 2*(h->frame_num_offset + h->frame_num);
3525
3526         if(!h->nal_ref_idc)
3527             poc--;
3528
3529         field_poc[0]= poc;
3530         field_poc[1]= poc;
3531     }
3532
3533     if(s->picture_structure != PICT_BOTTOM_FIELD)
3534         s->current_picture_ptr->field_poc[0]= field_poc[0];
3535     if(s->picture_structure != PICT_TOP_FIELD)
3536         s->current_picture_ptr->field_poc[1]= field_poc[1];
3537     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3538
3539     return 0;
3540 }
3541
3542
3543 /**
3544  * initialize scan tables
3545  */
3546 static void init_scan_tables(H264Context *h){
3547     MpegEncContext * const s = &h->s;
3548     int i;
3549     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3550         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3551         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3552     }else{
3553         for(i=0; i<16; i++){
3554 #define T(x) (x>>2) | ((x<<2) & 0xF)
3555             h->zigzag_scan[i] = T(zigzag_scan[i]);
3556             h-> field_scan[i] = T( field_scan[i]);
3557 #undef T
3558         }
3559     }
3560     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3561         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3562         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3563         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3564         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3565     }else{
3566         for(i=0; i<64; i++){
3567 #define T(x) (x>>3) | ((x&7)<<3)
3568             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3569             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3570             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3571             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3572 #undef T
3573         }
3574     }
3575     if(h->sps.transform_bypass){ //FIXME same ugly
3576         h->zigzag_scan_q0          = zigzag_scan;
3577         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3578         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3579         h->field_scan_q0           = field_scan;
3580         h->field_scan8x8_q0        = field_scan8x8;
3581         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3582     }else{
3583         h->zigzag_scan_q0          = h->zigzag_scan;
3584         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3585         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3586         h->field_scan_q0           = h->field_scan;
3587         h->field_scan8x8_q0        = h->field_scan8x8;
3588         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3589     }
3590 }
3591
3592 /**
3593  * Replicates H264 "master" context to thread contexts.
3594  */
3595 static void clone_slice(H264Context *dst, H264Context *src)
3596 {
3597     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3598     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3599     dst->s.current_picture      = src->s.current_picture;
3600     dst->s.linesize             = src->s.linesize;
3601     dst->s.uvlinesize           = src->s.uvlinesize;
3602     dst->s.first_field          = src->s.first_field;
3603
3604     dst->prev_poc_msb           = src->prev_poc_msb;
3605     dst->prev_poc_lsb           = src->prev_poc_lsb;
3606     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3607     dst->prev_frame_num         = src->prev_frame_num;
3608     dst->short_ref_count        = src->short_ref_count;
3609
3610     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3611     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3612     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3613     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3614
3615     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3616     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3617 }
3618
3619 /**
3620  * decodes a slice header.
3621  * This will also call MPV_common_init() and frame_start() as needed.
3622  *
3623  * @param h h264context
3624  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3625  *
3626  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3627  */
3628 static int decode_slice_header(H264Context *h, H264Context *h0){
3629     MpegEncContext * const s = &h->s;
3630     MpegEncContext * const s0 = &h0->s;
3631     unsigned int first_mb_in_slice;
3632     unsigned int pps_id;
3633     int num_ref_idx_active_override_flag;
3634     unsigned int slice_type, tmp, i, j;
3635     int default_ref_list_done = 0;
3636     int last_pic_structure;
3637
3638     s->dropable= h->nal_ref_idc == 0;
3639
3640     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3641         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3642         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3643     }else{
3644         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3645         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3646     }
3647
3648     first_mb_in_slice= get_ue_golomb(&s->gb);
3649
3650     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3651         h0->current_slice = 0;
3652         if (!s0->first_field)
3653             s->current_picture_ptr= NULL;
3654     }
3655
3656     slice_type= get_ue_golomb_31(&s->gb);
3657     if(slice_type > 9){
3658         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3659         return -1;
3660     }
3661     if(slice_type > 4){
3662         slice_type -= 5;
3663         h->slice_type_fixed=1;
3664     }else
3665         h->slice_type_fixed=0;
3666
3667     slice_type= golomb_to_pict_type[ slice_type ];
3668     if (slice_type == FF_I_TYPE
3669         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3670         default_ref_list_done = 1;
3671     }
3672     h->slice_type= slice_type;
3673     h->slice_type_nos= slice_type & 3;
3674
3675     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3676     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3677         av_log(h->s.avctx, AV_LOG_ERROR,
3678                "B picture before any references, skipping\n");
3679         return -1;
3680     }
3681
3682     pps_id= get_ue_golomb(&s->gb);
3683     if(pps_id>=MAX_PPS_COUNT){
3684         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3685         return -1;
3686     }
3687     if(!h0->pps_buffers[pps_id]) {
3688         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3689         return -1;
3690     }
3691     h->pps= *h0->pps_buffers[pps_id];
3692
3693     if(!h0->sps_buffers[h->pps.sps_id]) {
3694         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3695         return -1;
3696     }
3697     h->sps = *h0->sps_buffers[h->pps.sps_id];
3698
3699     if(h == h0 && h->dequant_coeff_pps != pps_id){
3700         h->dequant_coeff_pps = pps_id;
3701         init_dequant_tables(h);
3702     }
3703
3704     s->mb_width= h->sps.mb_width;
3705     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3706
3707     h->b_stride=  s->mb_width*4;
3708     h->b8_stride= s->mb_width*2;
3709
3710     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3711     if(h->sps.frame_mbs_only_flag)
3712         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3713     else
3714         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3715
3716     if (s->context_initialized
3717         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3718         if(h != h0)
3719             return -1;   // width / height changed during parallelized decoding
3720         free_tables(h);
3721         flush_dpb(s->avctx);
3722         MPV_common_end(s);
3723     }
3724     if (!s->context_initialized) {
3725         if(h != h0)
3726             return -1;  // we cant (re-)initialize context during parallel decoding
3727         if (MPV_common_init(s) < 0)
3728             return -1;
3729         s->first_field = 0;
3730
3731         init_scan_tables(h);
3732         alloc_tables(h);
3733
3734         for(i = 1; i < s->avctx->thread_count; i++) {
3735             H264Context *c;
3736             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3737             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3738             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3739             c->sps = h->sps;
3740             c->pps = h->pps;
3741             init_scan_tables(c);
3742             clone_tables(c, h);
3743         }
3744
3745         for(i = 0; i < s->avctx->thread_count; i++)
3746             if(context_init(h->thread_context[i]) < 0)
3747                 return -1;
3748
3749         s->avctx->width = s->width;
3750         s->avctx->height = s->height;
3751         s->avctx->sample_aspect_ratio= h->sps.sar;
3752         if(!s->avctx->sample_aspect_ratio.den)
3753             s->avctx->sample_aspect_ratio.den = 1;
3754
3755         if(h->sps.timing_info_present_flag){
3756             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3757             if(h->x264_build > 0 && h->x264_build < 44)
3758                 s->avctx->time_base.den *= 2;
3759             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3760                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3761         }
3762     }
3763
3764     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3765
3766     h->mb_mbaff = 0;
3767     h->mb_aff_frame = 0;
3768     last_pic_structure = s0->picture_structure;
3769     if(h->sps.frame_mbs_only_flag){
3770         s->picture_structure= PICT_FRAME;
3771     }else{
3772         if(get_bits1(&s->gb)) { //field_pic_flag
3773             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3774         } else {
3775             s->picture_structure= PICT_FRAME;
3776             h->mb_aff_frame = h->sps.mb_aff;
3777         }
3778     }
3779     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3780
3781     if(h0->current_slice == 0){
3782         while(h->frame_num !=  h->prev_frame_num &&
3783               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3784             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3785             frame_start(h);
3786             h->prev_frame_num++;
3787             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3788             s->current_picture_ptr->frame_num= h->prev_frame_num;
3789             execute_ref_pic_marking(h, NULL, 0);
3790         }
3791
3792         /* See if we have a decoded first field looking for a pair... */
3793         if (s0->first_field) {
3794             assert(s0->current_picture_ptr);
3795             assert(s0->current_picture_ptr->data[0]);
3796             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3797
3798             /* figure out if we have a complementary field pair */
3799             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3800                 /*
3801                  * Previous field is unmatched. Don't display it, but let it
3802                  * remain for reference if marked as such.
3803                  */
3804                 s0->current_picture_ptr = NULL;
3805                 s0->first_field = FIELD_PICTURE;
3806
3807             } else {
3808                 if (h->nal_ref_idc &&
3809                         s0->current_picture_ptr->reference &&
3810                         s0->current_picture_ptr->frame_num != h->frame_num) {
3811                     /*
3812                      * This and previous field were reference, but had
3813                      * different frame_nums. Consider this field first in
3814                      * pair. Throw away previous field except for reference
3815                      * purposes.
3816                      */
3817                     s0->first_field = 1;
3818                     s0->current_picture_ptr = NULL;
3819
3820                 } else {
3821                     /* Second field in complementary pair */
3822                     s0->first_field = 0;
3823                 }
3824             }
3825
3826         } else {
3827             /* Frame or first field in a potentially complementary pair */
3828             assert(!s0->current_picture_ptr);
3829             s0->first_field = FIELD_PICTURE;
3830         }
3831
3832         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3833             s0->first_field = 0;
3834             return -1;
3835         }
3836     }
3837     if(h != h0)
3838         clone_slice(h, h0);
3839
3840     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3841
3842     assert(s->mb_num == s->mb_width * s->mb_height);
3843     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3844        first_mb_in_slice                    >= s->mb_num){
3845         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3846         return -1;
3847     }
3848     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3849     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3850     if (s->picture_structure == PICT_BOTTOM_FIELD)
3851         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3852     assert(s->mb_y < s->mb_height);
3853
3854     if(s->picture_structure==PICT_FRAME){
3855         h->curr_pic_num=   h->frame_num;
3856         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3857     }else{
3858         h->curr_pic_num= 2*h->frame_num + 1;
3859         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3860     }
3861
3862     if(h->nal_unit_type == NAL_IDR_SLICE){
3863         get_ue_golomb(&s->gb); /* idr_pic_id */
3864     }
3865
3866     if(h->sps.poc_type==0){
3867         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3868
3869         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3870             h->delta_poc_bottom= get_se_golomb(&s->gb);
3871         }
3872     }
3873
3874     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3875         h->delta_poc[0]= get_se_golomb(&s->gb);
3876
3877         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3878             h->delta_poc[1]= get_se_golomb(&s->gb);
3879     }
3880
3881     init_poc(h);
3882
3883     if(h->pps.redundant_pic_cnt_present){
3884         h->redundant_pic_count= get_ue_golomb(&s->gb);
3885     }
3886
3887     //set defaults, might be overridden a few lines later
3888     h->ref_count[0]= h->pps.ref_count[0];
3889     h->ref_count[1]= h->pps.ref_count[1];
3890
3891     if(h->slice_type_nos != FF_I_TYPE){
3892         if(h->slice_type_nos == FF_B_TYPE){
3893             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3894         }
3895         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3896
3897         if(num_ref_idx_active_override_flag){
3898             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3899             if(h->slice_type_nos==FF_B_TYPE)
3900                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3901
3902             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3903                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3904                 h->ref_count[0]= h->ref_count[1]= 1;
3905                 return -1;
3906             }
3907         }
3908         if(h->slice_type_nos == FF_B_TYPE)
3909             h->list_count= 2;
3910         else
3911             h->list_count= 1;
3912     }else
3913         h->list_count= 0;
3914
3915     if(!default_ref_list_done){
3916         fill_default_ref_list(h);
3917     }
3918
3919     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3920         return -1;
3921
3922     if(h->slice_type_nos!=FF_I_TYPE){
3923         s->last_picture_ptr= &h->ref_list[0][0];
3924         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3925     }
3926     if(h->slice_type_nos==FF_B_TYPE){
3927         s->next_picture_ptr= &h->ref_list[1][0];
3928         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3929     }
3930
3931     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3932        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3933         pred_weight_table(h);
3934     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3935         implicit_weight_table(h);
3936     else
3937         h->use_weight = 0;
3938
3939     if(h->nal_ref_idc)
3940         decode_ref_pic_marking(h0, &s->gb);
3941
3942     if(FRAME_MBAFF)
3943         fill_mbaff_ref_list(h);
3944
3945     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3946         direct_dist_scale_factor(h);
3947     direct_ref_list_init(h);
3948
3949     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3950         tmp = get_ue_golomb_31(&s->gb);
3951         if(tmp > 2){
3952             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3953             return -1;
3954         }
3955         h->cabac_init_idc= tmp;
3956     }
3957
3958     h->last_qscale_diff = 0;
3959     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3960     if(tmp>51){
3961         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3962         return -1;
3963     }
3964     s->qscale= tmp;
3965     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3966     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3967     //FIXME qscale / qp ... stuff
3968     if(h->slice_type == FF_SP_TYPE){
3969         get_bits1(&s->gb); /* sp_for_switch_flag */
3970     }
3971     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3972         get_se_golomb(&s->gb); /* slice_qs_delta */
3973     }
3974
3975     h->deblocking_filter = 1;
3976     h->slice_alpha_c0_offset = 0;
3977     h->slice_beta_offset = 0;
3978     if( h->pps.deblocking_filter_parameters_present ) {
3979         tmp= get_ue_golomb_31(&s->gb);
3980         if(tmp > 2){
3981             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3982             return -1;
3983         }
3984         h->deblocking_filter= tmp;
3985         if(h->deblocking_filter < 2)
3986             h->deblocking_filter^= 1; // 1<->0
3987
3988         if( h->deblocking_filter ) {
3989             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3990             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3991         }
3992     }
3993
3994     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3995        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3996        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3997        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3998         h->deblocking_filter= 0;
3999
4000     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4001         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4002             /* Cheat slightly for speed:
4003                Do not bother to deblock across slices. */
4004             h->deblocking_filter = 2;
4005         } else {
4006             h0->max_contexts = 1;
4007             if(!h0->single_decode_warning) {
4008                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4009                 h0->single_decode_warning = 1;
4010             }
4011             if(h != h0)
4012                 return 1; // deblocking switched inside frame
4013         }
4014     }
4015
4016 #if 0 //FMO
4017     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4018         slice_group_change_cycle= get_bits(&s->gb, ?);
4019 #endif
4020
4021     h0->last_slice_type = slice_type;
4022     h->slice_num = ++h0->current_slice;
4023     if(h->slice_num >= MAX_SLICES){
4024         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4025     }
4026
4027     for(j=0; j<2; j++){
4028         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4029         ref2frm[0]=
4030         ref2frm[1]= -1;
4031         for(i=0; i<16; i++)
4032             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4033                           +(h->ref_list[j][i].reference&3);
4034         ref2frm[18+0]=
4035         ref2frm[18+1]= -1;
4036         for(i=16; i<48; i++)
4037             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4038                           +(h->ref_list[j][i].reference&3);
4039     }
4040
4041     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4042     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4043
4044     s->avctx->refs= h->sps.ref_frame_count;
4045
4046     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4047         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4048                h->slice_num,
4049                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4050                first_mb_in_slice,
4051                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4052                pps_id, h->frame_num,
4053                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4054                h->ref_count[0], h->ref_count[1],
4055                s->qscale,
4056                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4057                h->use_weight,
4058                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4059                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4060                );
4061     }
4062
4063     return 0;
4064 }
4065
4066 /**
4067  *
4068  */
4069 static inline int get_level_prefix(GetBitContext *gb){
4070     unsigned int buf;
4071     int log;
4072
4073     OPEN_READER(re, gb);
4074     UPDATE_CACHE(re, gb);
4075     buf=GET_CACHE(re, gb);
4076
4077     log= 32 - av_log2(buf);
4078 #ifdef TRACE
4079     print_bin(buf>>(32-log), log);
4080     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4081 #endif
4082
4083     LAST_SKIP_BITS(re, gb, log);
4084     CLOSE_READER(re, gb);
4085
4086     return log-1;
4087 }
4088
4089 static inline int get_dct8x8_allowed(H264Context *h){
4090     if(h->sps.direct_8x8_inference_flag)
4091         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4092     else
4093         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4094 }
4095
4096 /**
4097  * decodes a residual block.
4098  * @param n block index
4099  * @param scantable scantable
4100  * @param max_coeff number of coefficients in the block
4101  * @return <0 if an error occurred
4102  */
4103 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4104     MpegEncContext * const s = &h->s;
4105     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4106     int level[16];
4107     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4108
4109     //FIXME put trailing_onex into the context
4110
4111     if(n == CHROMA_DC_BLOCK_INDEX){
4112         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4113         total_coeff= coeff_token>>2;
4114     }else{
4115         if(n == LUMA_DC_BLOCK_INDEX){
4116             total_coeff= pred_non_zero_count(h, 0);
4117             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4118             total_coeff= coeff_token>>2;
4119         }else{
4120             total_coeff= pred_non_zero_count(h, n);
4121             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4122             total_coeff= coeff_token>>2;
4123             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4124         }
4125     }
4126
4127     //FIXME set last_non_zero?
4128
4129     if(total_coeff==0)
4130         return 0;
4131     if(total_coeff > (unsigned)max_coeff) {
4132         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4133         return -1;
4134     }
4135
4136     trailing_ones= coeff_token&3;
4137     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4138     assert(total_coeff<=16);
4139
4140     i = show_bits(gb, 3);
4141     skip_bits(gb, trailing_ones);
4142     level[0] = 1-((i&4)>>1);
4143     level[1] = 1-((i&2)   );
4144     level[2] = 1-((i&1)<<1);
4145
4146     if(trailing_ones<total_coeff) {
4147         int mask, prefix;
4148         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4149         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4150         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4151
4152         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4153         if(level_code >= 100){
4154             prefix= level_code - 100;
4155             if(prefix == LEVEL_TAB_BITS)
4156                 prefix += get_level_prefix(gb);
4157
4158             //first coefficient has suffix_length equal to 0 or 1
4159             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4160                 if(suffix_length)
4161                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4162                 else
4163                     level_code= (prefix<<suffix_length); //part
4164             }else if(prefix==14){
4165                 if(suffix_length)
4166                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4167                 else
4168                     level_code= prefix + get_bits(gb, 4); //part
4169             }else{
4170                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4171                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4172                 if(prefix>=16)
4173                     level_code += (1<<(prefix-3))-4096;
4174             }
4175
4176             if(trailing_ones < 3) level_code += 2;
4177
4178             suffix_length = 2;
4179             mask= -(level_code&1);
4180             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4181         }else{
4182             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4183
4184             suffix_length = 1;
4185             if(level_code + 3U > 6U)
4186                 suffix_length++;
4187             level[trailing_ones]= level_code;
4188         }
4189
4190         //remaining coefficients have suffix_length > 0
4191         for(i=trailing_ones+1;i<total_coeff;i++) {
4192             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4193             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4194             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4195
4196             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4197             if(level_code >= 100){
4198                 prefix= level_code - 100;
4199                 if(prefix == LEVEL_TAB_BITS){
4200                     prefix += get_level_prefix(gb);
4201                 }
4202                 if(prefix<15){
4203                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4204                 }else{
4205                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4206                     if(prefix>=16)
4207                         level_code += (1<<(prefix-3))-4096;
4208                 }
4209                 mask= -(level_code&1);
4210                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4211             }
4212             level[i]= level_code;
4213
4214             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4215                 suffix_length++;
4216         }
4217     }
4218
4219     if(total_coeff == max_coeff)
4220         zeros_left=0;
4221     else{
4222         if(n == CHROMA_DC_BLOCK_INDEX)
4223             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4224         else
4225             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4226     }
4227
4228     coeff_num = zeros_left + total_coeff - 1;
4229     j = scantable[coeff_num];
4230     if(n > 24){
4231         block[j] = level[0];
4232         for(i=1;i<total_coeff;i++) {
4233             if(zeros_left <= 0)
4234                 run_before = 0;
4235             else if(zeros_left < 7){
4236                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4237             }else{
4238                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4239             }
4240             zeros_left -= run_before;
4241             coeff_num -= 1 + run_before;
4242             j= scantable[ coeff_num ];
4243
4244             block[j]= level[i];
4245         }
4246     }else{
4247         block[j] = (level[0] * qmul[j] + 32)>>6;
4248         for(i=1;i<total_coeff;i++) {
4249             if(zeros_left <= 0)
4250                 run_before = 0;
4251             else if(zeros_left < 7){
4252                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4253             }else{
4254                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4255             }
4256             zeros_left -= run_before;
4257             coeff_num -= 1 + run_before;
4258             j= scantable[ coeff_num ];
4259
4260             block[j]= (level[i] * qmul[j] + 32)>>6;
4261         }
4262     }
4263
4264     if(zeros_left<0){
4265         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4266         return -1;
4267     }
4268
4269     return 0;
4270 }
4271
/**
 * Derives the field decoding flag of the current macroblock from an already
 * decoded neighbour instead of reading it from the bitstream.
 *
 * The macroblock type at mb_xy-1 is used if its slice_table entry equals
 * h->slice_num; otherwise the type at mb_xy-s->mb_stride is used if that
 * entry equals h->slice_num; otherwise 0 is used.
 * h->mb_mbaff and h->mb_field_decoding_flag are both set to 1 if the chosen
 * type is IS_INTERLACED, else to 0.
 */
4272 static void predict_field_decoding_flag(H264Context *h){
4273     MpegEncContext * const s = &h->s;
4274     const int mb_xy= h->mb_xy;
     // neighbour preference: mb_xy-1 first, then mb_xy-mb_stride, else type 0
4275     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4276                 ? s->current_picture.mb_type[mb_xy-1]
4277                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4278                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4279                 : 0;
4280     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4281 }
4282
4283 /**
4284  * decodes a P_SKIP or B_SKIP macroblock
      *
      * No bits are read here. The non-zero-count data of the macroblock is
      * cleared, the motion data is predicted (pred_direct_motion() when
      * h->slice_type_nos is FF_B_TYPE, pred_pskip_motion() otherwise) and
      * written back, and mb_type, qscale_table and slice_table are updated
      * for h->mb_xy. h->prev_mb_skipped is set to 1.
4285  */
4286 static void decode_mb_skip(H264Context *h){
4287     MpegEncContext * const s = &h->s;
4288     const int mb_xy= h->mb_xy;
4289     int mb_type=0;
4290
     // a skipped macroblock has no coefficients
4291     memset(h->non_zero_count[mb_xy], 0, 16);
4292     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4293
4294     if(MB_FIELD)
4295         mb_type|= MB_TYPE_INTERLACED;
4296
4297     if( h->slice_type_nos == FF_B_TYPE )
4298     {
4299         // just for fill_caches. pred_direct_motion will set the real mb_type
4300         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4301
4302         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4303         pred_direct_motion(h, &mb_type);
         // pred_direct_motion() receives &mb_type and may rewrite it, so the skip bit is set again
4304         mb_type|= MB_TYPE_SKIP;
4305     }
4306     else
4307     {
4308         int mx, my;
4309         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4310
4311         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4312         pred_pskip_motion(h, &mx, &my);
         // list 0: value 0 into ref_cache and the predicted (mx,my) into mv_cache,
         // starting at scan8[0] (presumably a 4x4 area with row stride 8 -- see fill_rectangle())
4313         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4314         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4315     }
4316
4317     write_back_motion(h, mb_type);
4318     s->current_picture.mb_type[mb_xy]= mb_type;
4319     s->current_picture.qscale_table[mb_xy]= s->qscale;
4320     h->slice_table[ mb_xy ]= h->slice_num;
4321     h->prev_mb_skipped= 1;
4322 }
4323
4324 /**
4325  * decodes a macroblock
      *
      * Non-CABAC path: the syntax elements of the macroblock at
      * (s->mb_x, s->mb_y) are read with the get_bits / Exp-Golomb readers from
      * s->gb, and the residual is read through decode_residual().
      * Order of parsing: skip run (non-I slices), field decoding flag
      * (FRAME_MBAFF), mb_type, then either raw PCM samples or prediction data
      * (intra modes / sub types, reference indices, motion vectors), cbp,
      * optional 8x8 transform bit, dquant and residual blocks.
      *
      * NOTE(review): when MB_MBAFF is set, h->ref_count[] is doubled before
      * the prediction data is parsed and only halved again right before the
      * final "return 0"; every "return -1" in between leaves it doubled.
      * Whether the caller resets it after an error is not visible here.
      *
4326  * @returns 0 if OK, -1 if an error is noticed
4327  */
4328 static int decode_mb_cavlc(H264Context *h){
4329     MpegEncContext * const s = &h->s;
4330     int mb_xy;
4331     int partition_count;
4332     unsigned int mb_type, cbp;
4333     int dct8x8_allowed= h->pps.transform_8x8_mode;
4334
4335     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4336
4337     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4338     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4339                 down the code */
     // non-I slices: a run length of skipped macroblocks precedes each coded one;
     // mb_skip_run==-1 means that a new run length has to be read
4340     if(h->slice_type_nos != FF_I_TYPE){
4341         if(s->mb_skip_run==-1)
4342             s->mb_skip_run= get_ue_golomb(&s->gb);
4343
         // tested before the decrement: a run of 0 falls through (leaving -1 behind)
4344         if (s->mb_skip_run--) {
4345             if(FRAME_MBAFF && (s->mb_y&1) == 0){
                 // even mb_y: read the flag if this was the last skipped MB of the run,
                 // otherwise derive it from a neighbour
4346                 if(s->mb_skip_run==0)
4347                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4348                 else
4349                     predict_field_decoding_flag(h);
4350             }
4351             decode_mb_skip(h);
4352             return 0;
4353         }
4354     }
4355     if(FRAME_MBAFF){
4356         if( (s->mb_y&1) == 0 )
4357             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4358     }
4359
4360     h->prev_mb_skipped= 0;
4361
     // mb_type: values below 23 (B) / 5 (P) index the inter tables, larger values
     // are intra types offset by that amount
4362     mb_type= get_ue_golomb(&s->gb);
4363     if(h->slice_type_nos == FF_B_TYPE){
4364         if(mb_type < 23){
4365             partition_count= b_mb_type_info[mb_type].partition_count;
4366             mb_type=         b_mb_type_info[mb_type].type;
4367         }else{
4368             mb_type -= 23;
4369             goto decode_intra_mb;
4370         }
4371     }else if(h->slice_type_nos == FF_P_TYPE){
4372         if(mb_type < 5){
4373             partition_count= p_mb_type_info[mb_type].partition_count;
4374             mb_type=         p_mb_type_info[mb_type].type;
4375         }else{
4376             mb_type -= 5;
4377             goto decode_intra_mb;
4378         }
4379     }else{
4380        assert(h->slice_type_nos == FF_I_TYPE);
4381         if(h->slice_type == FF_SI_TYPE && mb_type)
4382             mb_type--;
4383 decode_intra_mb:
         // mb_type is unsigned, so this single test bounds the i_mb_type_info index to 0..25
4384         if(mb_type > 25){
4385             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4386             return -1;
4387         }
4388         partition_count=0;
4389         cbp= i_mb_type_info[mb_type].cbp;
4390         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4391         mb_type= i_mb_type_info[mb_type].type;
4392     }
4393
4394     if(MB_FIELD)
4395         mb_type |= MB_TYPE_INTERLACED;
4396
4397     h->slice_table[ mb_xy ]= h->slice_num;
4398
4399     if(IS_INTRA_PCM(mb_type)){
4400         unsigned int x;
4401
4402         // We assume these blocks are very rare so we do not optimize it.
4403         align_get_bits(&s->gb);
4404
4405         // The pixels are stored in the same order as levels in h->mb array.
         // 256 bytes are read, or 384 when CHROMA is set
4406         for(x=0; x < (CHROMA ? 384 : 256); x++){
4407             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4408         }
4409
4410         // In deblocking, the quantizer is 0
4411         s->current_picture.qscale_table[mb_xy]= 0;
4412         // All coeffs are present
4413         memset(h->non_zero_count[mb_xy], 16, 16);
4414
4415         s->current_picture.mb_type[mb_xy]= mb_type;
4416         return 0;
4417     }
4418
     // doubled here, halved again only at the end of the function (see NOTE(review) above)
4419     if(MB_MBAFF){
4420         h->ref_count[0] <<= 1;
4421         h->ref_count[1] <<= 1;
4422     }
4423
4424     fill_caches(h, mb_type, 0);
4425
4426     //mb_pred
4427     if(IS_INTRA(mb_type)){
4428         int pred_mode;
4429 //            init_top_left_availability(h);
4430         if(IS_INTRA4x4(mb_type)){
4431             int i;
4432             int di = 1;
             // one bit selects the 8x8 transform; the modes are then parsed per
             // 8x8 block (step 4) instead of per 4x4 block (step 1)
4433             if(dct8x8_allowed && get_bits1(&s->gb)){
4434                 mb_type |= MB_TYPE_8x8DCT;
4435                 di = 4;
4436             }
4437
4438 //                fill_intra4x4_pred_table(h);
4439             for(i=0; i<16; i+=di){
4440                 int mode= pred_intra_mode(h, i);
4441
                 // flag bit set: keep the predicted mode; otherwise 3 more bits give
                 // a mode from which the predicted one is excluded
4442                 if(!get_bits1(&s->gb)){
4443                     const int rem_mode= get_bits(&s->gb, 3);
4444                     mode = rem_mode + (rem_mode >= mode);
4445                 }
4446
4447                 if(di==4)
4448                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4449                 else
4450                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4451             }
4452             write_back_intra_pred_mode(h);
4453             if( check_intra4x4_pred_mode(h) < 0)
4454                 return -1;
4455         }else{
4456             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4457             if(h->intra16x16_pred_mode < 0)
4458                 return -1;
4459         }
4460         if(CHROMA){
4461             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4462             if(pred_mode < 0)
4463                 return -1;
4464             h->chroma_pred_mode= pred_mode;
4465         }
4466     }else if(partition_count==4){
4467         int i, j, sub_partition_count[4], list, ref[2][4];
4468
         // step 1: the four sub macroblock types, range checked before the table lookup
4469         if(h->slice_type_nos == FF_B_TYPE){
4470             for(i=0; i<4; i++){
4471                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4472                 if(h->sub_mb_type[i] >=13){
4473                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4474                     return -1;
4475                 }
4476                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4477                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4478             }
4479             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4480                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4481                 pred_direct_motion(h, &mb_type);
4482                 h->ref_cache[0][scan8[4]] =
4483                 h->ref_cache[1][scan8[4]] =
4484                 h->ref_cache[0][scan8[12]] =
4485                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4486             }
4487         }else{
4488             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4489             for(i=0; i<4; i++){
4490                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4491                 if(h->sub_mb_type[i] >=4){
4492                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4493                     return -1;
4494                 }
4495                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4496                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4497             }
4498         }
4499
         // step 2: one reference index per 8x8 block and list; nothing is read when
         // only one reference exists, a single inverted bit when there are two.
         // ref[list][i] stays unset for direct sub blocks, which step 3 skips as well.
4500         for(list=0; list<h->list_count; list++){
4501             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4502             for(i=0; i<4; i++){
4503                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4504                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4505                     unsigned int tmp;
4506                     if(ref_count == 1){
4507                         tmp= 0;
4508                     }else if(ref_count == 2){
4509                         tmp= get_bits1(&s->gb)^1;
4510                     }else{
4511                         tmp= get_ue_golomb_31(&s->gb);
4512                         if(tmp>=ref_count){
4513                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4514                             return -1;
4515                         }
4516                     }
4517                     ref[list][i]= tmp;
4518                 }else{
4519                  //FIXME
4520                     ref[list][i] = -1;
4521                 }
4522             }
4523         }
4524
4525         if(dct8x8_allowed)
4526             dct8x8_allowed = get_dct8x8_allowed(h);
4527
         // step 3: store the reference indices in the cache and read one motion vector
         // difference pair per sub partition, added to the pred_motion() prediction
4528         for(list=0; list<h->list_count; list++){
4529             for(i=0; i<4; i++){
4530                 if(IS_DIRECT(h->sub_mb_type[i])) {
4531                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4532                     continue;
4533                 }
4534                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4535                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4536
4537                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4538                     const int sub_mb_type= h->sub_mb_type[i];
4539                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4540                     for(j=0; j<sub_partition_count[i]; j++){
4541                         int mx, my;
4542                         const int index= 4*i + block_width*j;
4543                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4544                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4545                         mx += get_se_golomb(&s->gb);
4546                         my += get_se_golomb(&s->gb);
4547                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4548
                         // replicate the vector into the cache entries covered by the
                         // sub partition (offsets 1, 8 and 9 relative to its first entry)
4549                         if(IS_SUB_8X8(sub_mb_type)){
4550                             mv_cache[ 1 ][0]=
4551                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4552                             mv_cache[ 1 ][1]=
4553                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4554                         }else if(IS_SUB_8X4(sub_mb_type)){
4555                             mv_cache[ 1 ][0]= mx;
4556                             mv_cache[ 1 ][1]= my;
4557                         }else if(IS_SUB_4X8(sub_mb_type)){
4558                             mv_cache[ 8 ][0]= mx;
4559                             mv_cache[ 8 ][1]= my;
4560                         }
4561                         mv_cache[ 0 ][0]= mx;
4562                         mv_cache[ 0 ][1]= my;
4563                     }
4564                 }else{
                     // list not used by this sub block: zero its four cached vectors
4565                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4566                     p[0] = p[1]=
4567                     p[8] = p[9]= 0;
4568                 }
4569             }
4570         }
4571     }else if(IS_DIRECT(mb_type)){
4572         pred_direct_motion(h, &mb_type);
4573         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4574     }else{
         // 16x16, 16x8 and 8x16 partitions: for each shape all reference indices of
         // all lists are parsed first, then all motion vector differences
4575         int list, mx, my, i;
4576          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4577         if(IS_16X16(mb_type)){
4578             for(list=0; list<h->list_count; list++){
4579                     unsigned int val;
4580                     if(IS_DIR(mb_type, 0, list)){
4581                         if(h->ref_count[list]==1){
4582                             val= 0;
4583                         }else if(h->ref_count[list]==2){
4584                             val= get_bits1(&s->gb)^1;
4585                         }else{
4586                             val= get_ue_golomb_31(&s->gb);
4587                             if(val >= h->ref_count[list]){
4588                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4589                                 return -1;
4590                             }
4591                         }
4592                     }else
4593                         val= LIST_NOT_USED&0xFF;
4594                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4595             }
4596             for(list=0; list<h->list_count; list++){
4597                 unsigned int val;
4598                 if(IS_DIR(mb_type, 0, list)){
4599                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4600                     mx += get_se_golomb(&s->gb);
4601                     my += get_se_golomb(&s->gb);
4602                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4603
4604                     val= pack16to32(mx,my);
4605                 }else
4606                     val=0;
4607                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4608             }
4609         }
4610         else if(IS_16X8(mb_type)){
             // two partitions, the second one starts 16 cache entries after scan8[0]
4611             for(list=0; list<h->list_count; list++){
4612                     for(i=0; i<2; i++){
4613                         unsigned int val;
4614                         if(IS_DIR(mb_type, i, list)){
4615                             if(h->ref_count[list] == 1){
4616                                 val= 0;
4617                             }else if(h->ref_count[list] == 2){
4618                                 val= get_bits1(&s->gb)^1;
4619                             }else{
4620                                 val= get_ue_golomb_31(&s->gb);
4621                                 if(val >= h->ref_count[list]){
4622                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4623                                     return -1;
4624                                 }
4625                             }
4626                         }else
4627                             val= LIST_NOT_USED&0xFF;
4628                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4629                     }
4630             }
4631             for(list=0; list<h->list_count; list++){
4632                 for(i=0; i<2; i++){
4633                     unsigned int val;
4634                     if(IS_DIR(mb_type, i, list)){
4635                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4636                         mx += get_se_golomb(&s->gb);
4637                         my += get_se_golomb(&s->gb);
4638                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4639
4640                         val= pack16to32(mx,my);
4641                     }else
4642                         val=0;
4643                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4644                 }
4645             }
4646         }else{
             // two partitions, the second one starts 2 cache entries after scan8[0]
4647             assert(IS_8X16(mb_type));
4648             for(list=0; list<h->list_count; list++){
4649                     for(i=0; i<2; i++){
4650                         unsigned int val;
4651                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4652                             if(h->ref_count[list]==1){
4653                                 val= 0;
4654                             }else if(h->ref_count[list]==2){
4655                                 val= get_bits1(&s->gb)^1;
4656                             }else{
4657                                 val= get_ue_golomb_31(&s->gb);
4658                                 if(val >= h->ref_count[list]){
4659                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4660                                     return -1;
4661                                 }
4662                             }
4663                         }else
4664                             val= LIST_NOT_USED&0xFF;
4665                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4666                     }
4667             }
4668             for(list=0; list<h->list_count; list++){
4669                 for(i=0; i<2; i++){
4670                     unsigned int val;
4671                     if(IS_DIR(mb_type, i, list)){
4672                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4673                         mx += get_se_golomb(&s->gb);
4674                         my += get_se_golomb(&s->gb);
4675                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4676
4677                         val= pack16to32(mx,my);
4678                     }else
4679                         val=0;
4680                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4681                 }
4682             }
4683         }
4684     }
4685
4686     if(IS_INTER(mb_type))
4687         write_back_motion(h, mb_type);
4688
     // intra 16x16 took its cbp from i_mb_type_info above; every other type reads a
     // code number (0..47) that is mapped through a table chosen by CHROMA and intra/inter
4689     if(!IS_INTRA16x16(mb_type)){
4690         cbp= get_ue_golomb(&s->gb);
4691         if(cbp > 47){
4692             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4693             return -1;
4694         }
4695
4696         if(CHROMA){
4697             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4698             else                     cbp= golomb_to_inter_cbp   [cbp];
4699         }else{
4700             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4701             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4702         }
4703     }
4704     h->cbp = cbp;
4705
     // non-intra macroblocks with at least one of the low 4 cbp bits set: one bit
     // selects the 8x8 transform (intra 4x4 read its bit before the prediction modes)
4706     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4707         if(get_bits1(&s->gb)){
4708             mb_type |= MB_TYPE_8x8DCT;
4709             h->cbp_table[mb_xy]= cbp;
4710         }
4711     }
4712     s->current_picture.mb_type[mb_xy]= mb_type;
4713
4714     if(cbp || IS_INTRA16x16(mb_type)){
4715         int i8x8, i4x4, chroma_idx;
4716         int dquant;
         // the residual is read from the intra or inter reader depending on the MB type
4717         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4718         const uint8_t *scan, *scan8x8, *dc_scan;
4719
4720 //        fill_non_zero_count_cache(h);
4721
         // NOTE: the scan tables are chosen from s->qscale before dquant is applied below
4722         if(IS_INTERLACED(mb_type)){
4723             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4724             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4725             dc_scan= luma_dc_field_scan;
4726         }else{
4727             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4728             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4729             dc_scan= luma_dc_zigzag_scan;
4730         }
4731
4732         dquant= get_se_golomb(&s->gb);
4733
4734         if( dquant > 25 || dquant < -26 ){
4735             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4736             return -1;
4737         }
4738
         // apply the delta and wrap the result back into 0..51
4739         s->qscale += dquant;
4740         if(((unsigned)s->qscale) > 51){
4741             if(s->qscale<0) s->qscale+= 52;
4742             else            s->qscale-= 52;
4743         }
4744
4745         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4746         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4747         if(IS_INTRA16x16(mb_type)){
             // luma DC block first, then (if the low 4 cbp bits are set) sixteen blocks of
             // 15 coefficients each, read with the scan table advanced by one entry
4748             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4749                 return -1; //FIXME continue if partitioned and other return -1 too
4750             }
4751
4752             assert((cbp&15) == 0 || (cbp&15) == 15);
4753
4754             if(cbp&15){
4755                 for(i8x8=0; i8x8<4; i8x8++){
4756                     for(i4x4=0; i4x4<4; i4x4++){
4757                         const int index= i4x4 + 4*i8x8;
4758                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4759                             return -1;
4760                         }
4761                     }
4762                 }
4763             }else{
4764                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4765             }
4766         }else{
             // bit i8x8 of cbp tells whether the corresponding 8x8 luma block is coded
4767             for(i8x8=0; i8x8<4; i8x8++){
4768                 if(cbp & (1<<i8x8)){
4769                     if(IS_8x8DCT(mb_type)){
                         // four residual calls with 16 coefficients each go into the same
                         // 64 entry buffer; their counts are summed into the first nnz entry
4770                         DCTELEM *buf = &h->mb[64*i8x8];
4771                         uint8_t *nnz;
4772                         for(i4x4=0; i4x4<4; i4x4++){
4773                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4774                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4775                                 return -1;
4776                         }
4777                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4778                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4779                     }else{
4780                         for(i4x4=0; i4x4<4; i4x4++){
4781                             const int index= i4x4 + 4*i8x8;
4782
4783                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4784                                 return -1;
4785                             }
4786                         }
4787                     }
4788                 }else{
4789                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4790                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4791                 }
4792             }
4793         }
4794
         // cbp bits 0x30: chroma DC blocks (no dequant table is passed);
         // cbp bit 0x20: additionally the chroma blocks with 15 coefficients each
4795         if(cbp&0x30){
4796             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4797                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4798                     return -1;
4799                 }
4800         }
4801
4802         if(cbp&0x20){
4803             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4804                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4805                 for(i4x4=0; i4x4<4; i4x4++){
4806                     const int index= 16 + 4*chroma_idx + i4x4;
4807                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4808                         return -1;
4809                     }
4810                 }
4811             }
4812         }else{
4813             uint8_t * const nnz= &h->non_zero_count_cache[0];
4814             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4815             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4816         }
4817     }else{
         // nothing coded: clear all luma and chroma non-zero counts in the cache
4818         uint8_t * const nnz= &h->non_zero_count_cache[0];
4819         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4820         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4821         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4822     }
4823     s->current_picture.qscale_table[mb_xy]= s->qscale;
4824     write_back_non_zero_count(h);
4825
     // undo the doubling done before fill_caches(); not reached on the error returns above
4826     if(MB_MBAFF){
4827         h->ref_count[0] >>= 1;
4828         h->ref_count[1] >>= 1;
4829     }
4830
4831     return 0;
4832 }
4833
/**
 * Decodes the field decoding flag with CABAC.
 *
 * The context index is 70 plus the number of neighbours (0..2) that have a
 * slice_table entry equal to h->slice_num and an interlaced mb_type. The
 * neighbours are taken relative to s->mb_y rounded down to an even row: the
 * macroblock at mb_x-1 in that row, and the macroblock at mb_x two rows
 * above it.
 *
 * @returns the decoded bin as returned by get_cabac_noinline()
 */
4834 static int decode_cabac_field_decoding_flag(H264Context *h) {
4835     MpegEncContext * const s = &h->s;
4836     const int mb_x = s->mb_x;
     // clear the lowest bit: always work from the even row
4837     const int mb_y = s->mb_y & ~1;
4838     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4839     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4840
4841     unsigned int ctx = 0;
4842
4843     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4844         ctx += 1;
4845     }
4846     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4847         ctx += 1;
4848     }
4849
4850     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4851 }
4852
/**
 * Decodes an intra macroblock type with CABAC.
 *
 * @param ctx_base    index of the first state in h->cabac_state used here
 * @param intra_slice non-zero selects the variant whose first bin context
 *                    depends on the neighbours (h->left_mb_xy[0] and
 *                    h->top_mb_xy) and which then uses the states starting
 *                    two entries later; it also changes the state offsets of
 *                    the last bins
 * @returns 0 for I4x4, 25 for PCM, otherwise a value in 1..24 built as
 *          1 + 12*bin + (optional 4 + 4*bin) + 2*bin + bin
 */
4853 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4854     uint8_t *state= &h->cabac_state[ctx_base];
4855     int mb_type;
4856
4857     if(intra_slice){
4858         MpegEncContext * const s = &h->s;
4859         const int mba_xy = h->left_mb_xy[0];
4860         const int mbb_xy = h->top_mb_xy;
4861         int ctx=0;
         // count the neighbours in the current slice that are not I4x4
4862         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4863             ctx++;
4864         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4865             ctx++;
4866         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4867             return 0;   /* I4x4 */
         // the first bin used states 0..2, the remaining bins index from state 2 on
4868         state += 2;
4869     }else{
4870         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4871             return 0;   /* I4x4 */
4872     }
4873
4874     if( get_cabac_terminate( &h->cabac ) )
4875         return 25;  /* PCM */
4876
4877     mb_type = 1; /* I16x16 */
4878     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4879     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4880         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4881     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4882     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4883     return mb_type;
4884 }
4885
/**
 * Decodes the macroblock type of a macroblock in a B slice with CABAC.
 *
 * The first bin uses state 27 plus the number of neighbours
 * (h->left_mb_xy[0], h->top_mb_xy) that are in the current slice and are not
 * direct; the following bins use states 27+3, 27+4 and 27+5.
 *
 * @returns 0 (B_Direct_16x16), 1..2, 3..10, 11, 12..21 or 22 (B_8x8) for
 *          inter types, or 23 + decode_cabac_intra_mb_type(h, 32, 0) for
 *          intra types
 */
4886 static int decode_cabac_mb_type_b( H264Context *h ) {
4887     MpegEncContext * const s = &h->s;
4888
4889         const int mba_xy = h->left_mb_xy[0];
4890         const int mbb_xy = h->top_mb_xy;
4891         int ctx = 0;
4892         int bits;
4893         assert(h->slice_type_nos == FF_B_TYPE);
4894
4895         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4896             ctx++;
4897         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4898             ctx++;
4899
4900         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4901             return 0; /* B_Direct_16x16 */
4902
4903         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4904             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4905         }
4906
         // four more bins, most significant first
4907         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4908         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4909         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4910         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4911         if( bits < 8 )
4912             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4913         else if( bits == 13 ) {
4914             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4915         } else if( bits == 14 )
4916             return 11; /* B_L1_L0_8x16 */
4917         else if( bits == 15 )
4918             return 22; /* B_8x8 */
4919
         // bits is 8..12 here; one more bin gives 16..25, i.e. a result of 12..21
4920         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4921         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4922 }
4923
/**
 * Decodes the skip flag of the macroblock at (mb_x, mb_y) with CABAC.
 *
 * The context index counts the left (mba_xy) and top (mbb_xy) neighbours
 * that are in the current slice and are not skipped. It is added to state 11,
 * with a further offset of 13 when h->slice_type_nos is FF_B_TYPE.
 * With FRAME_MBAFF the neighbour positions depend on mb_y, on MB_FIELD and on
 * whether the neighbouring macroblocks are interlaced.
 *
 * @returns the decoded bin as returned by get_cabac_noinline()
 */
4924 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4925     MpegEncContext * const s = &h->s;
4926     int mba_xy, mbb_xy;
4927     int ctx = 0;
4928
4929     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
         // mb_xy addresses the even row of the current macroblock pair
4930         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4931         mba_xy = mb_xy - 1;
         // odd mb_y: move to the left macroblock one row further down, provided the
         // left one is in this slice and its interlaced state matches MB_FIELD
4932         if( (mb_y&1)
4933             && h->slice_table[mba_xy] == h->slice_num
4934             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4935             mba_xy += s->mb_stride;
4936         if( MB_FIELD ){
4937             mbb_xy = mb_xy - s->mb_stride;
             // even mb_y with an interlaced macroblock above in this slice: go up one more row
4938             if( !(mb_y&1)
4939                 && h->slice_table[mbb_xy] == h->slice_num
4940                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4941                 mbb_xy -= s->mb_stride;
4942         }else
4943             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4944     }else{
         // NOTE: this branch uses h->mb_xy rather than the mb_x/mb_y arguments
4945         int mb_xy = h->mb_xy;
4946         mba_xy = mb_xy - 1;
4947         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4948     }
4949
4950     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4951         ctx++;
4952     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4953         ctx++;
4954
4955     if( h->slice_type_nos == FF_B_TYPE )
4956         ctx += 13;
4957     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4958 }
4959
/**
 * Decodes an intra 4x4 prediction mode with CABAC.
 *
 * @param pred_mode the predicted mode
 * @returns pred_mode if the first bin (state 68) is set; otherwise a 3 bit
 *          value read least significant bit first (state 69), incremented by
 *          one if it is greater than or equal to pred_mode so that the
 *          predicted mode itself is never returned by this path
 */
4960 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4961     int mode = 0;
4962
4963     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4964         return pred_mode;
4965
4966     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4967     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4968     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4969
4970     if( mode >= pred_mode )
4971         return mode + 1;
4972     else
4973         return mode;
4974 }
4975
/**
 * Decodes the chroma prediction mode with CABAC.
 *
 * The first bin uses state 64 plus the number of neighbours
 * (h->left_mb_xy[0], h->top_mb_xy) that are in the current slice and have a
 * non-zero entry in h->chroma_pred_mode_table; up to two further bins use
 * state 64+3.
 *
 * @returns the mode, 0..3 (the number of set bins read before the first
 *          cleared one, at most three bins are read)
 */
4976 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4977     const int mba_xy = h->left_mb_xy[0];
4978     const int mbb_xy = h->top_mb_xy;
4979
4980     int ctx = 0;
4981
4982     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4983     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4984         ctx++;
4985
4986     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4987         ctx++;
4988
4989     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4990         return 0;
4991
4992     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4993         return 1;
4994     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4995         return 2;
4996     else
4997         return 3;
4998 }
4999
/**
 * Decodes the four luma bits of the coded block pattern with CABAC.
 *
 * One bin is read per bit (bit 0 first). Each bin uses state 73 + ctx, where
 * ctx is 1 if a first related cbp bit is clear plus 2 if a second one is
 * clear; the related bits come from h->left_cbp, h->top_cbp or from the bits
 * of the current macroblock decoded so far.
 * A neighbour that is not in the current slice is replaced by -1 (all bits
 * set), so it never adds to ctx.
 *
 * @returns the luma cbp, 0..15
 */
5000 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5001     int cbp_b, cbp_a, ctx, cbp = 0;
5002
5003     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5004     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5005
     // bit 0: left cbp bit 1 and top cbp bit 2
5006     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5007     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
     // bit 1: own bit 0 and top cbp bit 3
5008     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5009     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
     // bit 2: left cbp bit 3 and own bit 0
5010     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5011     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
     // bit 3: own bit 2 and own bit 1
5012     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5013     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5014     return cbp;
5015 }
5016 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5017     int ctx;
5018     int cbp_a, cbp_b;
5019
5020     cbp_a = (h->left_cbp>>4)&0x03;
5021     cbp_b = (h-> top_cbp>>4)&0x03;
5022
5023     ctx = 0;
5024     if( cbp_a > 0 ) ctx++;
5025     if( cbp_b > 0 ) ctx += 2;
5026     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5027         return 0;
5028
5029     ctx = 4;
5030     if( cbp_a == 2 ) ctx++;
5031     if( cbp_b == 2 ) ctx += 2;
5032     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5033 }
5034 static int decode_cabac_mb_dqp( H264Context *h) {
5035     int   ctx= h->last_qscale_diff != 0;
5036     int   val = 0;
5037
5038     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5039         ctx= 2+(ctx>>1);
5040         val++;
5041         if(val > 102) //prevent infinite loop
5042             return INT_MIN;
5043     }
5044
5045     if( val&0x01 )
5046         return   (val + 1)>>1 ;
5047     else
5048         return -((val + 1)>>1);
5049 }
5050 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5051     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5052         return 0;   /* 8x8 */
5053     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5054         return 1;   /* 8x4 */
5055     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5056         return 2;   /* 4x8 */
5057     return 3;       /* 4x4 */
5058 }
5059 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5060     int type;
5061     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5062         return 0;   /* B_Direct_8x8 */
5063     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5064         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5065     type = 3;
5066     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5067         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5068             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5069         type += 4;
5070     }
5071     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5072     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5073     return type;
5074 }
5075
5076 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5077     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5078 }
5079
5080 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5081     int refa = h->ref_cache[list][scan8[n] - 1];
5082     int refb = h->ref_cache[list][scan8[n] - 8];
5083     int ref  = 0;
5084     int ctx  = 0;
5085
5086     if( h->slice_type_nos == FF_B_TYPE) {
5087         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5088             ctx++;
5089         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5090             ctx += 2;
5091     } else {
5092         if( refa > 0 )
5093             ctx++;
5094         if( refb > 0 )
5095             ctx += 2;
5096     }
5097
5098     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5099         ref++;
5100         ctx = (ctx>>2)+4;
5101         if(ref >= 32 /*h->ref_list[list]*/){
5102             return -1;
5103         }
5104     }
5105     return ref;
5106 }
5107
5108 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5109     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5110                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5111     int ctxbase = (l == 0) ? 40 : 47;
5112     int mvd;
5113     int ctx = (amvd>2) + (amvd>32);
5114
5115     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5116         return 0;
5117
5118     mvd= 1;
5119     ctx= 3;
5120     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5121         mvd++;
5122         if( ctx < 6 )
5123             ctx++;
5124     }
5125
5126     if( mvd >= 9 ) {
5127         int k = 3;
5128         while( get_cabac_bypass( &h->cabac ) ) {
5129             mvd += 1 << k;
5130             k++;
5131             if(k>24){
5132                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5133                 return INT_MIN;
5134             }
5135         }
5136         while( k-- ) {
5137             if( get_cabac_bypass( &h->cabac ) )
5138                 mvd += 1 << k;
5139         }
5140     }
5141     return get_cabac_bypass_sign( &h->cabac, -mvd );
5142 }
5143
5144 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5145     int nza, nzb;
5146     int ctx = 0;
5147
5148     if( is_dc ) {
5149         if( cat == 0 ) {
5150             nza = h->left_cbp&0x100;
5151             nzb = h-> top_cbp&0x100;
5152         } else {
5153             nza = (h->left_cbp>>(6+idx))&0x01;
5154             nzb = (h-> top_cbp>>(6+idx))&0x01;
5155         }
5156     } else {
5157         assert(cat == 1 || cat == 2 || cat == 4);
5158         nza = h->non_zero_count_cache[scan8[idx] - 1];
5159         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5160     }
5161
5162     if( nza > 0 )
5163         ctx++;
5164
5165     if( nzb > 0 )
5166         ctx += 2;
5167
5168     return ctx + 4 * cat;
5169 }
5170
/**
 * Context offsets of the "last significant coefficient" flag for 8x8 blocks,
 * indexed by the position in scan order (0..62). The C fallback of the
 * significance decoding adds the entry to the last-coefficient context base.
 * NOTE(review): presumably declared via DECLARE_ASM_CONST so the x86 inline
 * assembly version can reference it too -- confirm against h264_i386.h.
 */
DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
    5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
5177
5178 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5179     static const int significant_coeff_flag_offset[2][6] = {
5180       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5181       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5182     };
5183     static const int last_coeff_flag_offset[2][6] = {
5184       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5185       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5186     };
5187     static const int coeff_abs_level_m1_offset[6] = {
5188         227+0, 227+10, 227+20, 227+30, 227+39, 426
5189     };
5190     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5191       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5192         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5193         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5194        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5195       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5196         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5197         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5198         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5199     };
5200     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5201      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5202      * map node ctx => cabac ctx for level=1 */
5203     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5204     /* map node ctx => cabac ctx for level>1 */
5205     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5206     static const uint8_t coeff_abs_level_transition[2][8] = {
5207     /* update node ctx after decoding a level=1 */
5208         { 1, 2, 3, 3, 4, 5, 6, 7 },
5209     /* update node ctx after decoding a level>1 */
5210         { 4, 4, 4, 4, 5, 6, 7, 7 }
5211     };
5212
5213     int index[64];
5214
5215     int av_unused last;
5216     int coeff_count = 0;
5217     int node_ctx = 0;
5218
5219     uint8_t *significant_coeff_ctx_base;
5220     uint8_t *last_coeff_ctx_base;
5221     uint8_t *abs_level_m1_ctx_base;
5222
5223 #ifndef ARCH_X86
5224 #define CABAC_ON_STACK
5225 #endif
5226 #ifdef CABAC_ON_STACK
5227 #define CC &cc
5228     CABACContext cc;
5229     cc.range     = h->cabac.range;
5230     cc.low       = h->cabac.low;
5231     cc.bytestream= h->cabac.bytestream;
5232 #else
5233 #define CC &h->cabac
5234 #endif
5235
5236
5237     /* cat: 0-> DC 16x16  n = 0
5238      *      1-> AC 16x16  n = luma4x4idx
5239      *      2-> Luma4x4   n = luma4x4idx
5240      *      3-> DC Chroma n = iCbCr
5241      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5242      *      5-> Luma8x8   n = 4 * luma8x8idx
5243      */
5244
5245     /* read coded block flag */
5246     if( is_dc || cat != 5 ) {
5247         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5248             if( !is_dc )
5249                 h->non_zero_count_cache[scan8[n]] = 0;
5250
5251 #ifdef CABAC_ON_STACK
5252             h->cabac.range     = cc.range     ;
5253             h->cabac.low       = cc.low       ;
5254             h->cabac.bytestream= cc.bytestream;
5255 #endif
5256             return;
5257         }
5258     }
5259
5260     significant_coeff_ctx_base = h->cabac_state
5261         + significant_coeff_flag_offset[MB_FIELD][cat];
5262     last_coeff_ctx_base = h->cabac_state
5263         + last_coeff_flag_offset[MB_FIELD][cat];
5264     abs_level_m1_ctx_base = h->cabac_state
5265         + coeff_abs_level_m1_offset[cat];
5266
5267     if( !is_dc && cat == 5 ) {
5268 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5269         for(last= 0; last < coefs; last++) { \
5270             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5271             if( get_cabac( CC, sig_ctx )) { \
5272                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5273                 index[coeff_count++] = last; \
5274                 if( get_cabac( CC, last_ctx ) ) { \
5275                     last= max_coeff; \
5276                     break; \
5277                 } \
5278             } \
5279         }\
5280         if( last == max_coeff -1 ) {\
5281             index[coeff_count++] = last;\
5282         }
5283         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5284 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5285         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5286     } else {
5287         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5288 #else
5289         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5290     } else {
5291         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5292 #endif
5293     }
5294     assert(coeff_count > 0);
5295
5296     if( is_dc ) {
5297         if( cat == 0 )
5298             h->cbp_table[h->mb_xy] |= 0x100;
5299         else
5300             h->cbp_table[h->mb_xy] |= 0x40 << n;
5301     } else {
5302         if( cat == 5 )
5303             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5304         else {
5305             assert( cat == 1 || cat == 2 || cat == 4 );
5306             h->non_zero_count_cache[scan8[n]] = coeff_count;
5307         }
5308     }
5309
5310     do {
5311         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5312
5313         int j= scantable[index[--coeff_count]];
5314
5315         if( get_cabac( CC, ctx ) == 0 ) {
5316             node_ctx = coeff_abs_level_transition[0][node_ctx];
5317             if( is_dc ) {
5318                 block[j] = get_cabac_bypass_sign( CC, -1);
5319             }else{
5320                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5321             }
5322         } else {
5323             int coeff_abs = 2;
5324             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5325             node_ctx = coeff_abs_level_transition[1][node_ctx];
5326
5327             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5328                 coeff_abs++;
5329             }
5330
5331             if( coeff_abs >= 15 ) {
5332                 int j = 0;
5333                 while( get_cabac_bypass( CC ) ) {
5334                     j++;
5335                 }
5336
5337                 coeff_abs=1;
5338                 while( j-- ) {
5339                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5340                 }
5341                 coeff_abs+= 14;
5342             }
5343
5344             if( is_dc ) {
5345                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5346             }else{
5347                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5348             }
5349         }
5350     } while( coeff_count );
5351 #ifdef CABAC_ON_STACK
5352             h->cabac.range     = cc.range     ;
5353             h->cabac.low       = cc.low       ;
5354             h->cabac.bytestream= cc.bytestream;
5355 #endif
5356
5357 }
5358
5359 #ifndef CONFIG_SMALL
5360 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5361     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5362 }
5363
5364 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5365     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5366 }
5367 #endif
5368
5369 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5370 #ifdef CONFIG_SMALL
5371     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5372 #else
5373     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5374     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5375 #endif
5376 }
5377
5378 static inline void compute_mb_neighbors(H264Context *h)
5379 {
5380     MpegEncContext * const s = &h->s;
5381     const int mb_xy  = h->mb_xy;
5382     h->top_mb_xy     = mb_xy - s->mb_stride;
5383     h->left_mb_xy[0] = mb_xy - 1;
5384     if(FRAME_MBAFF){
5385         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5386         const int top_pair_xy      = pair_xy     - s->mb_stride;
5387         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5388         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5389         const int curr_mb_field_flag = MB_FIELD;
5390         const int bottom = (s->mb_y & 1);
5391
5392         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5393             h->top_mb_xy -= s->mb_stride;
5394         }
5395         if (!left_mb_field_flag == curr_mb_field_flag) {
5396             h->left_mb_xy[0] = pair_xy - 1;
5397         }
5398     } else if (FIELD_PICTURE) {
5399         h->top_mb_xy -= s->mb_stride;
5400     }
5401     return;
5402 }
5403
5404 /**
5405  * decodes a macroblock
5406  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5407  */
5408 static int decode_mb_cabac(H264Context *h) {
5409     MpegEncContext * const s = &h->s;
5410     int mb_xy;
5411     int mb_type, partition_count, cbp = 0;
5412     int dct8x8_allowed= h->pps.transform_8x8_mode;
5413
5414     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5415
5416     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5417     if( h->slice_type_nos != FF_I_TYPE ) {
5418         int skip;
5419         /* a skipped mb needs the aff flag from the following mb */
5420         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5421             predict_field_decoding_flag(h);
5422         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5423             skip = h->next_mb_skipped;
5424         else
5425             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5426         /* read skip flags */
5427         if( skip ) {
5428             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5429                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5430                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5431                 if(!h->next_mb_skipped)
5432                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5433             }
5434
5435             decode_mb_skip(h);
5436
5437             h->cbp_table[mb_xy] = 0;
5438             h->chroma_pred_mode_table[mb_xy] = 0;
5439             h->last_qscale_diff = 0;
5440
5441             return 0;
5442
5443         }
5444     }
5445     if(FRAME_MBAFF){
5446         if( (s->mb_y&1) == 0 )
5447             h->mb_mbaff =
5448             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5449     }
5450
5451     h->prev_mb_skipped = 0;
5452
5453     compute_mb_neighbors(h);
5454
5455     if( h->slice_type_nos == FF_B_TYPE ) {
5456         mb_type = decode_cabac_mb_type_b( h );
5457         if( mb_type < 23 ){
5458             partition_count= b_mb_type_info[mb_type].partition_count;
5459             mb_type=         b_mb_type_info[mb_type].type;
5460         }else{
5461             mb_type -= 23;
5462             goto decode_intra_mb;
5463         }
5464     } else if( h->slice_type_nos == FF_P_TYPE ) {
5465         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5466             /* P-type */
5467             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5468                 /* P_L0_D16x16, P_8x8 */
5469                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5470             } else {
5471                 /* P_L0_D8x16, P_L0_D16x8 */
5472                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5473             }
5474             partition_count= p_mb_type_info[mb_type].partition_count;
5475             mb_type=         p_mb_type_info[mb_type].type;
5476         } else {
5477             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5478             goto decode_intra_mb;
5479         }
5480     } else {
5481         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5482         if(h->slice_type == FF_SI_TYPE && mb_type)
5483             mb_type--;
5484         assert(h->slice_type_nos == FF_I_TYPE);
5485 decode_intra_mb:
5486         partition_count = 0;
5487         cbp= i_mb_type_info[mb_type].cbp;
5488         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5489         mb_type= i_mb_type_info[mb_type].type;
5490     }
5491     if(MB_FIELD)
5492         mb_type |= MB_TYPE_INTERLACED;
5493
5494     h->slice_table[ mb_xy ]= h->slice_num;
5495
5496     if(IS_INTRA_PCM(mb_type)) {
5497         const uint8_t *ptr;
5498
5499         // We assume these blocks are very rare so we do not optimize it.
5500         // FIXME The two following lines get the bitstream position in the cabac
5501         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5502         ptr= h->cabac.bytestream;
5503         if(h->cabac.low&0x1) ptr--;
5504         if(CABAC_BITS==16){
5505             if(h->cabac.low&0x1FF) ptr--;
5506         }
5507
5508         // The pixels are stored in the same order as levels in h->mb array.
5509         memcpy(h->mb, ptr, 256); ptr+=256;
5510         if(CHROMA){
5511             memcpy(h->mb+128, ptr, 128); ptr+=128;
5512         }
5513
5514         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5515
5516         // All blocks are present
5517         h->cbp_table[mb_xy] = 0x1ef;
5518         h->chroma_pred_mode_table[mb_xy] = 0;
5519         // In deblocking, the quantizer is 0
5520         s->current_picture.qscale_table[mb_xy]= 0;
5521         // All coeffs are present
5522         memset(h->non_zero_count[mb_xy], 16, 16);
5523         s->current_picture.mb_type[mb_xy]= mb_type;
5524         h->last_qscale_diff = 0;
5525         return 0;
5526     }
5527
5528     if(MB_MBAFF){
5529         h->ref_count[0] <<= 1;
5530         h->ref_count[1] <<= 1;
5531     }
5532
5533     fill_caches(h, mb_type, 0);
5534
5535     if( IS_INTRA( mb_type ) ) {
5536         int i, pred_mode;
5537         if( IS_INTRA4x4( mb_type ) ) {
5538             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5539                 mb_type |= MB_TYPE_8x8DCT;
5540                 for( i = 0; i < 16; i+=4 ) {
5541                     int pred = pred_intra_mode( h, i );
5542                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5543                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5544                 }
5545             } else {
5546                 for( i = 0; i < 16; i++ ) {
5547                     int pred = pred_intra_mode( h, i );
5548                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5549
5550                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5551                 }
5552             }
5553             write_back_intra_pred_mode(h);
5554             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5555         } else {
5556             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5557             if( h->intra16x16_pred_mode < 0 ) return -1;
5558         }
5559         if(CHROMA){
5560             h->chroma_pred_mode_table[mb_xy] =
5561             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5562
5563             pred_mode= check_intra_pred_mode( h, pred_mode );
5564             if( pred_mode < 0 ) return -1;
5565             h->chroma_pred_mode= pred_mode;
5566         }
5567     } else if( partition_count == 4 ) {
5568         int i, j, sub_partition_count[4], list, ref[2][4];
5569
5570         if( h->slice_type_nos == FF_B_TYPE ) {
5571             for( i = 0; i < 4; i++ ) {
5572                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5573                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5574                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5575             }
5576             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5577                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5578                 pred_direct_motion(h, &mb_type);
5579                 h->ref_cache[0][scan8[4]] =
5580                 h->ref_cache[1][scan8[4]] =
5581                 h->ref_cache[0][scan8[12]] =
5582                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5583                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5584                     for( i = 0; i < 4; i++ )
5585                         if( IS_DIRECT(h->sub_mb_type[i]) )
5586                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5587                 }
5588             }
5589         } else {
5590             for( i = 0; i < 4; i++ ) {
5591                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5592                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5593                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5594             }
5595         }
5596
5597         for( list = 0; list < h->list_count; list++ ) {
5598                 for( i = 0; i < 4; i++ ) {
5599                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5600                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5601                         if( h->ref_count[list] > 1 ){
5602                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5603                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5604                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5605                                 return -1;
5606                             }
5607                         }else
5608                             ref[list][i] = 0;
5609                     } else {
5610                         ref[list][i] = -1;
5611                     }
5612                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5613                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5614                 }
5615         }
5616
5617         if(dct8x8_allowed)
5618             dct8x8_allowed = get_dct8x8_allowed(h);
5619
5620         for(list=0; list<h->list_count; list++){
5621             for(i=0; i<4; i++){
5622                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5623                 if(IS_DIRECT(h->sub_mb_type[i])){
5624                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5625                     continue;
5626                 }
5627
5628                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5629                     const int sub_mb_type= h->sub_mb_type[i];
5630                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5631                     for(j=0; j<sub_partition_count[i]; j++){
5632                         int mpx, mpy;
5633                         int mx, my;
5634                         const int index= 4*i + block_width*j;
5635                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5636                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5637                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5638
5639                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5640                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5641                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5642
5643                         if(IS_SUB_8X8(sub_mb_type)){
5644                             mv_cache[ 1 ][0]=
5645                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5646                             mv_cache[ 1 ][1]=
5647                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5648
5649                             mvd_cache[ 1 ][0]=
5650                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5651                             mvd_cache[ 1 ][1]=
5652                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5653                         }else if(IS_SUB_8X4(sub_mb_type)){
5654                             mv_cache[ 1 ][0]= mx;
5655                             mv_cache[ 1 ][1]= my;
5656
5657                             mvd_cache[ 1 ][0]= mx - mpx;
5658                             mvd_cache[ 1 ][1]= my - mpy;
5659                         }else if(IS_SUB_4X8(sub_mb_type)){
5660                             mv_cache[ 8 ][0]= mx;
5661                             mv_cache[ 8 ][1]= my;
5662
5663                             mvd_cache[ 8 ][0]= mx - mpx;
5664                             mvd_cache[ 8 ][1]= my - mpy;
5665                         }
5666                         mv_cache[ 0 ][0]= mx;
5667                         mv_cache[ 0 ][1]= my;
5668
5669                         mvd_cache[ 0 ][0]= mx - mpx;
5670                         mvd_cache[ 0 ][1]= my - mpy;
5671                     }
5672                 }else{
5673                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5674                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5675                     p[0] = p[1] = p[8] = p[9] = 0;
5676                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5677                 }
5678             }
5679         }
5680     } else if( IS_DIRECT(mb_type) ) {
5681         pred_direct_motion(h, &mb_type);
5682         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5683         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5684         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5685     } else {
5686         int list, mx, my, i, mpx, mpy;
5687         if(IS_16X16(mb_type)){
5688             for(list=0; list<h->list_count; list++){
5689                 if(IS_DIR(mb_type, 0, list)){
5690                     int ref;
5691                     if(h->ref_count[list] > 1){
5692                         ref= decode_cabac_mb_ref(h, list, 0);
5693                         if(ref >= (unsigned)h->ref_count[list]){
5694                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5695                             return -1;
5696                         }
5697                     }else
5698                         ref=0;
5699                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5700                 }else
5701                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5702             }
5703             for(list=0; list<h->list_count; list++){
5704                 if(IS_DIR(mb_type, 0, list)){
5705                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5706
5707                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5708                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5709                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5710
5711                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5712                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5713                 }else
5714                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5715             }
5716         }
5717         else if(IS_16X8(mb_type)){
5718             for(list=0; list<h->list_count; list++){
5719                     for(i=0; i<2; i++){
5720                         if(IS_DIR(mb_type, i, list)){
5721                             int ref;
5722                             if(h->ref_count[list] > 1){
5723                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5724                                 if(ref >= (unsigned)h->ref_count[list]){
5725                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5726                                     return -1;
5727                                 }
5728                             }else
5729                                 ref=0;
5730                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5731                         }else
5732                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5733                     }
5734             }
5735             for(list=0; list<h->list_count; list++){
5736                 for(i=0; i<2; i++){
5737                     if(IS_DIR(mb_type, i, list)){
5738                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5739                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5740                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5741                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5742
5743                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5744                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5745                     }else{
5746                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5747                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5748                     }
5749                 }
5750             }
5751         }else{
5752             assert(IS_8X16(mb_type));
5753             for(list=0; list<h->list_count; list++){
5754                     for(i=0; i<2; i++){
5755                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5756                             int ref;
5757                             if(h->ref_count[list] > 1){
5758                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5759                                 if(ref >= (unsigned)h->ref_count[list]){
5760                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5761                                     return -1;
5762                                 }
5763                             }else
5764                                 ref=0;
5765                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5766                         }else
5767                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5768                     }
5769             }
5770             for(list=0; list<h->list_count; list++){
5771                 for(i=0; i<2; i++){
5772                     if(IS_DIR(mb_type, i, list)){
5773                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5774                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5775                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5776
5777                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5778                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5779                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5780                     }else{
5781                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5782                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5783                     }
5784                 }
5785             }
5786         }
5787     }
5788
5789    if( IS_INTER( mb_type ) ) {
5790         h->chroma_pred_mode_table[mb_xy] = 0;
5791         write_back_motion( h, mb_type );
5792    }
5793
5794     if( !IS_INTRA16x16( mb_type ) ) {
5795         cbp  = decode_cabac_mb_cbp_luma( h );
5796         if(CHROMA)
5797             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5798     }
5799
5800     h->cbp_table[mb_xy] = h->cbp = cbp;
5801
5802     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5803         if( decode_cabac_mb_transform_size( h ) )
5804             mb_type |= MB_TYPE_8x8DCT;
5805     }
5806     s->current_picture.mb_type[mb_xy]= mb_type;
5807
5808     if( cbp || IS_INTRA16x16( mb_type ) ) {
5809         const uint8_t *scan, *scan8x8, *dc_scan;
5810         const uint32_t *qmul;
5811         int dqp;
5812
5813         if(IS_INTERLACED(mb_type)){
5814             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5815             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5816             dc_scan= luma_dc_field_scan;
5817         }else{
5818             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5819             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5820             dc_scan= luma_dc_zigzag_scan;
5821         }
5822
5823         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5824         if( dqp == INT_MIN ){
5825             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5826             return -1;
5827         }
5828         s->qscale += dqp;
5829         if(((unsigned)s->qscale) > 51){
5830             if(s->qscale<0) s->qscale+= 52;
5831             else            s->qscale-= 52;
5832         }
5833         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5834         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5835
5836         if( IS_INTRA16x16( mb_type ) ) {
5837             int i;
5838             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5839             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5840
5841             if( cbp&15 ) {
5842                 qmul = h->dequant4_coeff[0][s->qscale];
5843                 for( i = 0; i < 16; i++ ) {
5844                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5845                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5846                 }
5847             } else {
5848                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5849             }
5850         } else {
5851             int i8x8, i4x4;
5852             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5853                 if( cbp & (1<<i8x8) ) {
5854                     if( IS_8x8DCT(mb_type) ) {
5855                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5856                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5857                     } else {
5858                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5859                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5860                             const int index = 4*i8x8 + i4x4;
5861                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5862 //START_TIMER
5863                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5864 //STOP_TIMER("decode_residual")
5865                         }
5866                     }
5867                 } else {
5868                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5869                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5870                 }
5871             }
5872         }
5873
5874         if( cbp&0x30 ){
5875             int c;
5876             for( c = 0; c < 2; c++ ) {
5877                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5878                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5879             }
5880         }
5881
5882         if( cbp&0x20 ) {
5883             int c, i;
5884             for( c = 0; c < 2; c++ ) {
5885                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5886                 for( i = 0; i < 4; i++ ) {
5887                     const int index = 16 + 4 * c + i;
5888                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5889                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5890                 }
5891             }
5892         } else {
5893             uint8_t * const nnz= &h->non_zero_count_cache[0];
5894             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5895             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5896         }
5897     } else {
5898         uint8_t * const nnz= &h->non_zero_count_cache[0];
5899         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5900         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5901         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5902         h->last_qscale_diff = 0;
5903     }
5904
5905     s->current_picture.qscale_table[mb_xy]= s->qscale;
5906     write_back_non_zero_count(h);
5907
5908     if(MB_MBAFF){
5909         h->ref_count[0] >>= 1;
5910         h->ref_count[1] >>= 1;
5911     }
5912
5913     return 0;
5914 }
5915
5916
/**
 * Deblock one vertical luma edge by dispatching to the DSP "h" loop filters.
 *
 * alpha and beta are looked up from qp plus the slice's alpha_c0 / beta
 * offsets (the tables are addressed through a +52 bias). When bS[0] < 4 the
 * normal filter is run with one tc value per bS entry taken from tc0_table;
 * otherwise the intra variant is called. Only bS[0] is inspected for that
 * choice.
 *
 * @param h      decoder context; supplies the slice offsets and DSP pointers
 * @param pix    sample pointer, forwarded unchanged to the DSP routine
 * @param stride line stride, forwarded unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     quantizer used for the alpha/beta/tc0 table lookups
 */
5917 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5918     const int index_a = qp + h->slice_alpha_c0_offset;
5919     const int alpha = (alpha_table+52)[index_a];
5920     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5921
         /* The deblocking process only touches a line of samples when
          * |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta (H.264 8.7.2; the same
          * tests are written out in filter_mb_mbaff_edgev()). With alpha or
          * beta equal to zero that can never hold, so skip the DSP call. */
         if( alpha == 0 || beta == 0 )
             return;

5922     if( bS[0] < 4 ) {
5923         int8_t tc[4];
5924         tc[0] = (tc0_table+52)[index_a][bS[0]];
5925         tc[1] = (tc0_table+52)[index_a][bS[1]];
5926         tc[2] = (tc0_table+52)[index_a][bS[2]];
5927         tc[3] = (tc0_table+52)[index_a][bS[3]];
5928         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5929     } else {
5930         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5931     }
5932 }
/**
 * Deblock one vertical chroma edge by dispatching to the DSP "h" chroma
 * loop filters.
 *
 * alpha and beta are looked up from qp plus the slice's alpha_c0 / beta
 * offsets. When bS[0] < 4 the normal filter is run with tc = tc0_table
 * entry + 1 for each bS entry; otherwise the intra variant is called. Only
 * bS[0] is inspected for that choice.
 *
 * @param h      decoder context; supplies the slice offsets and DSP pointers
 * @param pix    sample pointer, forwarded unchanged to the DSP routine
 * @param stride line stride, forwarded unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     (chroma) quantizer used for the table lookups
 */
5933 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5934     const int index_a = qp + h->slice_alpha_c0_offset;
5935     const int alpha = (alpha_table+52)[index_a];
5936     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5937
         /* Samples are only modified when |p0-q0| < alpha and
          * |p1-p0|, |q1-q0| < beta (H.264 8.7.2; spelled out in
          * filter_mb_mbaff_edgecv()). A zero alpha or beta therefore means
          * nothing can change, so skip the DSP call. */
         if( alpha == 0 || beta == 0 )
             return;

5938     if( bS[0] < 4 ) {
5939         int8_t tc[4];
5940         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5941         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5942         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5943         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5944         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5945     } else {
5946         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5947     }
5948 }
5949
/**
 * Deblock a 16-row vertical luma edge one row at a time, with per-row
 * boundary strength and quantizer selection.
 *
 * For every row the samples p2,p1,p0 | q0,q1,q2 are pix[-3..-1] | pix[0..2];
 * pix advances by stride after each row.
 *  - Row -> bS entry: with MB_FIELD, entry i>>1 (two consecutive rows per
 *    entry); otherwise even rows use the even entry and odd rows the odd
 *    entry of each pair.
 *  - Row -> qp entry: with MB_FIELD, qp[0] for rows 0-7 and qp[1] for rows
 *    8-15; otherwise qp[0] for even rows and qp[1] for odd rows.
 *  - bS == 0 skips the row, bS < 4 applies the tc-clipped filter, anything
 *    else applies the strong filter.
 *
 * @param h      decoder context (slice alpha/beta offsets, trace context)
 * @param pix    first q-side sample of row 0 (pix[-1] is p0)
 * @param stride distance in bytes between consecutive rows
 * @param bS     eight boundary strengths
 * @param qp     two quantizers, selected per row as described above
 */
5950 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5951     int i;
5952     for( i = 0; i < 16; i++, pix += stride) {
5953         int index_a;
5954         int alpha;
5955         int beta;
5956
5957         int qp_index;
5958         int bS_index = (i >> 1);
5959         if (!MB_FIELD) {
                 /* non-field MB: rows alternate between the even and odd
                  * entry of the pair */
5960             bS_index &= ~1;
5961             bS_index |= (i & 1);
5962         }
5963
5964         if( bS[bS_index] == 0 ) {
5965             continue;
5966         }
5967
5968         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5969         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5970         alpha = (alpha_table+52)[index_a];
5971         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5972
5973         if( bS[bS_index] < 4 ) {
5974             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5975             const int p0 = pix[-1];
5976             const int p1 = pix[-2];
5977             const int p2 = pix[-3];
5978             const int q0 = pix[0];
5979             const int q1 = pix[1];
5980             const int q2 = pix[2];
5981
5982             if( FFABS( p0 - q0 ) < alpha &&
5983                 FFABS( p1 - p0 ) < beta &&
5984                 FFABS( q1 - q0 ) < beta ) {
5985                 int tc = tc0;
5986                 int i_delta;
5987
                     /* each side that is smooth enough gets its second
                      * sample corrected (clipped to +-tc0) and widens the
                      * clip range used for p0/q0 by one */
5988                 if( FFABS( p2 - p0 ) < beta ) {
5989                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5990                     tc++;
5991                 }
5992                 if( FFABS( q2 - q0 ) < beta ) {
5993                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5994                     tc++;
5995                 }
5996
5997                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5998                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5999                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* the "=>" half of the trace reads the samples back
                      * from pix so that a modified p1'/q1' is reported, like
                      * the strong-filter trace below does */
6000                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], pix[-2], pix[-1], pix[0], pix[1]);
6001             }
6002         }else{
6003             const int p0 = pix[-1];
6004             const int p1 = pix[-2];
6005             const int p2 = pix[-3];
6006
6007             const int q0 = pix[0];
6008             const int q1 = pix[1];
6009             const int q2 = pix[2];
6010
6011             if( FFABS( p0 - q0 ) < alpha &&
6012                 FFABS( p1 - p0 ) < beta &&
6013                 FFABS( q1 - q0 ) < beta ) {
6014
                     /* three samples per side are rewritten only when the
                      * step across the edge is small and that side is
                      * smooth; otherwise only p0/q0 are replaced */
6015                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6016                     if( FFABS( p2 - p0 ) < beta)
6017                     {
6018                         const int p3 = pix[-4];
6019                         /* p0', p1', p2' */
6020                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6021                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6022                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6023                     } else {
6024                         /* p0' */
6025                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6026                     }
6027                     if( FFABS( q2 - q0 ) < beta)
6028                     {
6029                         const int q3 = pix[3];
6030                         /* q0', q1', q2' */
6031                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6032                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6033                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6034                     } else {
6035                         /* q0' */
6036                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6037                     }
6038                 }else{
6039                     /* p0', q0' */
6040                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6041                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6042                 }
6043                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6044             }
6045         }
6046     }
6047 }
/**
 * Deblock an 8-row vertical chroma edge one row at a time.
 *
 * Row r uses bS[r]. Its quantizer is qp[r >> 2] when MB_FIELD is set and
 * qp[r & 1] otherwise. Rows with bS 0, or whose samples fail the alpha/beta
 * threshold tests, are left untouched. Only p0 (pix[-1]) and q0 (pix[0]) are
 * ever written: clipped by tc for bS < 4, replaced by a 3-tap average
 * otherwise.
 *
 * @param h      decoder context (slice alpha/beta offsets, trace context)
 * @param pix    first q-side sample of row 0 (pix[-1] is p0)
 * @param stride distance in bytes between consecutive rows
 * @param bS     eight boundary strengths, one per row
 * @param qp     two quantizers, selected per row as described above
 */
6048 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6049     int row;
6050
6051     for( row = 0; row < 8; row++, pix += stride ) {
6052         const int strength = bS[row];
6053         int row_qp;
6054         int index_a;
6055         int alpha;
6056         int beta;
6057         int p1;
6058         int p0;
6059         int q0;
6060         int q1;
6061
6062         if( !strength )
6063             continue;
6064
6065         row_qp  = qp[ MB_FIELD ? (row >> 2) : (row & 1) ];
6066         index_a = row_qp + h->slice_alpha_c0_offset;
6067         alpha   = (alpha_table+52)[index_a];
6068         beta    = (beta_table+52)[row_qp + h->slice_beta_offset];
6069
6070         p1 = pix[-2];
6071         p0 = pix[-1];
6072         q0 = pix[0];
6073         q1 = pix[1];
6074
             /* both filter variants share the same activity test */
6075         if( FFABS( p0 - q0 ) >= alpha ||
6076             FFABS( p1 - p0 ) >= beta  ||
6077             FFABS( q1 - q0 ) >= beta )
6078             continue;
6079
6080         if( strength < 4 ) {
6081             const int tc      = (tc0_table+52)[index_a][strength] + 1;
6082             const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6083
6084             pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6085             pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6086             tprintf(h->s.avctx,
6087                     "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n",
6088                     row, row_qp, index_a, alpha, beta, tc, strength,
6089                     pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6090         } else {
6091             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6092             pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6093             tprintf(h->s.avctx,
6094                     "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n",
6095                     row, pix[-3], p1, p0, q0, q1, pix[2],
6096                     pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6097         }
6098     }
6099 }
6100
/**
 * Deblock one horizontal luma edge by dispatching to the DSP "v" loop
 * filters.
 *
 * alpha and beta are looked up from qp plus the slice's alpha_c0 / beta
 * offsets (the tables are addressed through a +52 bias). When bS[0] < 4 the
 * normal filter is run with one tc value per bS entry taken from tc0_table;
 * otherwise the intra variant is called. Only bS[0] is inspected for that
 * choice.
 *
 * @param h      decoder context; supplies the slice offsets and DSP pointers
 * @param pix    sample pointer, forwarded unchanged to the DSP routine
 * @param stride line stride, forwarded unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     quantizer used for the alpha/beta/tc0 table lookups
 */
6101 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6102     const int index_a = qp + h->slice_alpha_c0_offset;
6103     const int alpha = (alpha_table+52)[index_a];
6104     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6105
         /* The deblocking process only touches a column of samples when
          * |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta (H.264 8.7.2). With
          * alpha or beta equal to zero that can never hold, so skip the DSP
          * call. */
         if( alpha == 0 || beta == 0 )
             return;

6106     if( bS[0] < 4 ) {
6107         int8_t tc[4];
6108         tc[0] = (tc0_table+52)[index_a][bS[0]];
6109         tc[1] = (tc0_table+52)[index_a][bS[1]];
6110         tc[2] = (tc0_table+52)[index_a][bS[2]];
6111         tc[3] = (tc0_table+52)[index_a][bS[3]];
6112         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6113     } else {
6114         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6115     }
6116 }
6117
/**
 * Deblock one horizontal chroma edge by dispatching to the DSP "v" chroma
 * loop filters.
 *
 * alpha and beta are looked up from qp plus the slice's alpha_c0 / beta
 * offsets. When bS[0] < 4 the normal filter is run with tc = tc0_table
 * entry + 1 for each bS entry; otherwise the intra variant is called. Only
 * bS[0] is inspected for that choice.
 *
 * @param h      decoder context; supplies the slice offsets and DSP pointers
 * @param pix    sample pointer, forwarded unchanged to the DSP routine
 * @param stride line stride, forwarded unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     (chroma) quantizer used for the table lookups
 */
6118 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6119     const int index_a = qp + h->slice_alpha_c0_offset;
6120     const int alpha = (alpha_table+52)[index_a];
6121     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6122
         /* Samples are only modified when |p0-q0| < alpha and
          * |p1-p0|, |q1-q0| < beta (H.264 8.7.2). A zero alpha or beta
          * therefore means nothing can change, so skip the DSP call. */
         if( alpha == 0 || beta == 0 )
             return;

6123     if( bS[0] < 4 ) {
6124         int8_t tc[4];
6125         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6126         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6127         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6128         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6129         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6130     } else {
6131         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6132     }
6133 }
6134
/**
 * Fast-path deblocking of one macroblock.
 *
 * Hands the macroblock to filter_mb() whenever one of the conditions in the
 * first if() holds (first column, first row, no DSP strength routine,
 * pps.chroma_qp_diff set, CODEC_FLAG2_FAST not set, or deblocking_filter == 2
 * with the left/top neighbour in another slice). Otherwise:
 *  - returns without filtering when the luma/chroma quantizers of this MB
 *    and the averages with its left/top neighbours are all <= qp_thresh;
 *  - intra MBs are filtered with constant strengths (4 on the left MB edge,
 *    4 or 3 on the top MB edge, 3 on inner edges);
 *  - other MBs get their strengths from dsp.h264_loop_filter_strength(), or
 *    a constant 2 for 8x8-transform MBs whose low three cbp bits are set,
 *    and are then filtered edge by edge through the FILTER() macro.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock
 * @param img_cb     Cb samples of this macroblock
 * @param img_cr     Cr samples of this macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6135 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6136     MpegEncContext * const s = &h->s;
         /* 1 when the current picture is a bottom field, 0 otherwise */
6137     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6138     int mb_xy, mb_type;
6139     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6140
6141     mb_xy = h->mb_xy;
6142
         /* everything this fast path does not cover goes to filter_mb() */
6143     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6144         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus has to be, but should not under CODEC_FLAG2_FAST
6145        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6146                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6147         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6148         return;
6149     }
6150     assert(!FRAME_MBAFF);
6151
         /* qp: this MB, qp0: left neighbour (mb_xy-1), qp1: top neighbour
          * (top_mb_xy). Chroma values are derived with chroma index 0 only. */
6152     mb_type = s->current_picture.mb_type[mb_xy];
6153     qp = s->current_picture.qscale_table[mb_xy];
6154     qp0 = s->current_picture.qscale_table[mb_xy-1];
6155     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6156     qpc = get_chroma_qp( h, 0, qp );
6157     qpc0 = get_chroma_qp( h, 0, qp0 );
6158     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* from here on the *0 / *1 variables hold the rounded average of the
          * current MB's value and the left / top neighbour's value */
6159     qp0 = (qp + qp0 + 1) >> 1;
6160     qp1 = (qp + qp1 + 1) >> 1;
6161     qpc0 = (qpc + qpc0 + 1) >> 1;
6162     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* NOTE(review): presumably alpha_table is 0 for qp + alpha offset
          * <= 15, so nothing would be filtered below this threshold --
          * confirm against the table */
6163     qp_thresh = 15 - h->slice_alpha_c0_offset;
6164     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6165        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6166         return;
6167
6168     if( IS_INTRA(mb_type) ) {
             /* constant strengths: bS4 on the left MB edge, bSH (3 for field
              * pictures, else 4) on the top MB edge, bS3 on inner edges */
6169         int16_t bS4[4] = {4,4,4,4};
6170         int16_t bS3[4] = {3,3,3,3};
6171         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6172         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 are filtered */
6173             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6174             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6175             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6176             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6177         } else {
6178             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6179             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6180             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6181             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6182             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6183             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6184             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6185             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6186         }
             /* chroma: the MB edge and the single inner edge of each plane */
6187         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6188         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6189         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6190         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6191         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6192         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6193         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6194         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6195         return;
6196     } else {
             /* bS[dir][edge][i]: dir 0 is used with the vertical-edge
              * filters, dir 1 with the horizontal-edge filters. bSv views
              * the four int16_t strengths of one edge as a single uint64_t
              * so an edge can be set or tested for "all zero" at once. */
6197         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6198         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6199         int edges;
6200         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* strength 2 everywhere on edges 0 and 2; edges 1 and 3 are
                  * left unset and are not read for 8x8-transform MBs below */
6201             edges = 4;
6202             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6203         } else {
6204             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6205                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6206             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6207                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6208                              ? 3 : 0;
6209             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* a 16x16 MB without luma coefficients only needs edge 0 */
6210             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6211             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6212                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6213         }
             /* an intra neighbour overrides the strengths of the shared MB
              * edge: 4 for the left edge; 4, or 3 in field pictures, for the
              * top edge */
6214         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6215             bSv[0][0] = 0x0004000400040004ULL;
6216         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6217             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6218
         /* FILTER(hv,dir,edge): if any strength of the edge is non-zero,
          * filter the luma edge and, for even edges, the matching Cb and Cr
          * edge. Edge 0 uses the neighbour-averaged quantizer (qp0/qpc0 for
          * dir 0, qp1/qpc1 for dir 1), inner edges use qp/qpc. */
6219 #define FILTER(hv,dir,edge)\
6220         if(bSv[dir][edge]) {\
6221             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6222             if(!(edge&1)) {\
6223                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6224                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6225             }\
6226         }
6227         if( edges == 1 ) {
6228             FILTER(v,0,0);
6229             FILTER(h,1,0);
6230         } else if( IS_8x8DCT(mb_type) ) {
6231             FILTER(v,0,0);
6232             FILTER(v,0,2);
6233             FILTER(h,1,0);
6234             FILTER(h,1,2);
6235         } else {
6236             FILTER(v,0,0);
6237             FILTER(v,0,1);
6238             FILTER(v,0,2);
6239             FILTER(v,0,3);
6240             FILTER(h,1,0);
6241             FILTER(h,1,1);
6242             FILTER(h,1,2);
6243             FILTER(h,1,3);
6244         }
6245 #undef FILTER
6246     }
6247 }
6248
6249
6250 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6251     MpegEncContext * const s = &h->s;
6252     int edge;
6253     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6254     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6255     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6256     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6257     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6258
6259     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6260                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6261     // how often to recheck mv-based bS when iterating between edges
6262     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6263                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6264     // how often to recheck mv-based bS when iterating along each edge
6265     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6266
6267     if (first_vertical_edge_done) {
6268         start = 1;
6269     }
6270
6271     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6272         start = 1;
6273
6274     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6275         && !IS_INTERLACED(mb_type)
6276         && IS_INTERLACED(mbm_type)
6277         ) {
6278         // This is a special case in the norm where the filtering must
6279         // be done twice (one each of the field) even if we are in a
6280         // frame macroblock.
6281         //
6282         static const int nnz_idx[4] = {4,5,6,3};
6283         unsigned int tmp_linesize   = 2 *   linesize;
6284         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6285         int mbn_xy = mb_xy - 2 * s->mb_stride;
6286         int qp;
6287         int i, j;
6288         int16_t bS[4];
6289
6290         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6291             if( IS_INTRA(mb_type) ||
6292                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6293                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6294             } else {
6295                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6296                 for( i = 0; i < 4; i++ ) {
6297                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6298                         mbn_nnz[nnz_idx[i]] != 0 )
6299                         bS[i] = 2;
6300                     else
6301                         bS[i] = 1;
6302                 }
6303             }
6304             // Do not use s->qscale as luma quantizer because it has not the same
6305             // value in IPCM macroblocks.
6306             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6307             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6308             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6309             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6310             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6311                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6312             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6313                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6314         }
6315
6316         start = 1;
6317     }
6318
6319     /* Calculate bS */
6320     for( edge = start; edge < edges; edge++ ) {
6321         /* mbn_xy: neighbor macroblock */
6322         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6323         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6324         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6325         int16_t bS[4];
6326         int qp;
6327
6328         if( (edge&1) && IS_8x8DCT(mb_type) )
6329             continue;
6330
6331         if( IS_INTRA(mb_type) ||
6332             IS_INTRA(mbn_type) ) {
6333             int value;
6334             if (edge == 0) {
6335                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6336                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6337                 ) {
6338                     value = 4;
6339                 } else {
6340                     value = 3;
6341                 }
6342             } else {
6343                 value = 3;
6344             }
6345             bS[0] = bS[1] = bS[2] = bS[3] = value;
6346         } else {
6347             int i, l;
6348             int mv_done;
6349
6350             if( edge & mask_edge ) {
6351                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6352                 mv_done = 1;
6353             }
6354             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6355                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6356                 mv_done = 1;
6357             }
6358             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6359                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6360                 int bn_idx= b_idx - (dir ? 8:1);
6361                 int v = 0;
6362
6363                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6364                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6365                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6366                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6367                 }
6368
6369                 if(h->slice_type_nos == FF_B_TYPE && v){
6370                     v=0;
6371                     for( l = 0; !v && l < 2; l++ ) {
6372                         int ln= 1-l;
6373                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6374                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6375                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6376                     }
6377                 }
6378
6379                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6380                 mv_done = 1;
6381             }
6382             else
6383                 mv_done = 0;
6384
6385             for( i = 0; i < 4; i++ ) {
6386                 int x = dir == 0 ? edge : i;
6387                 int y = dir == 0 ? i    : edge;
6388                 int b_idx= 8 + 4 + x + 8*y;
6389                 int bn_idx= b_idx - (dir ? 8:1);
6390
6391                 if( h->non_zero_count_cache[b_idx] |
6392                     h->non_zero_count_cache[bn_idx] ) {
6393                     bS[i] = 2;
6394                 }
6395                 else if(!mv_done)
6396                 {
6397                     bS[i] = 0;
6398                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6399                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6400                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6401                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6402                             bS[i] = 1;
6403                             break;
6404                         }
6405                     }
6406
6407                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6408                         bS[i] = 0;
6409                         for( l = 0; l < 2; l++ ) {
6410                             int ln= 1-l;
6411                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6412                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6413                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6414                                 bS[i] = 1;
6415                                 break;
6416                             }
6417                         }
6418                     }
6419                 }
6420             }
6421
6422             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6423                 continue;
6424         }
6425
6426         /* Filter edge */
6427         // Do not use s->qscale as luma quantizer because it has not the same
6428         // value in IPCM macroblocks.
6429         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6430         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6431         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6432         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6433         if( dir == 0 ) {
6434             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6435             if( (edge&1) == 0 ) {
6436                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6437                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6438                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6439                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6440             }
6441         } else {
6442             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6443             if( (edge&1) == 0 ) {
6444                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6445                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6446                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6447                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6448             }
6449         }
6450     }
6451 }
6452
/**
 * Run the deblocking filter for one macroblock.
 *
 * The function works in four stages:
 *  1. (non-MBAFF only) return early when the macroblock QP, and its average
 *     with the left and top neighbour QP, are all at or below a threshold
 *     derived from slice_alpha_c0_offset and the chroma QP index offsets;
 *  2. for CAVLC streams with transform_8x8_mode, overwrite entries of
 *     h->non_zero_count_cache with values derived from coded-block-pattern bits;
 *  3. in MBAFF frames, when the current and the left macroblock differ in
 *     interlaced type, filter the first vertical edge here with 8 bS values
 *     and 2 QP values;
 *  4. call filter_mb_dir() for direction 0 and then for direction 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma plane pointer handed to the edge filters
 *                   (presumably the top-left sample of this macroblock -- confirm at caller)
 * @param img_cb     Cb plane pointer, used like img_y
 * @param img_cr     Cr plane pointer, used like img_y
 * @param linesize   luma stride passed to the luma edge filters
 * @param uvlinesize chroma stride passed to the chroma edge filters
 */
6453 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6454     MpegEncContext * const s = &h->s;
6455     const int mb_xy= mb_x + mb_y*s->mb_stride;
6456     const int mb_type = s->current_picture.mb_type[mb_xy];
    // vertical motion vector difference limit forwarded to filter_mb_dir():
    // 2 for interlaced macroblocks, 4 otherwise
6457     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
    // set to 1 once the MBAFF special case below has filtered the first vertical edge
6458     int first_vertical_edge_done = 0;
6459     int dir;
6460
6461     //for sufficiently low qp, filtering wouldn't do anything
6462     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6463     if(!FRAME_MBAFF){
6464         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6465         int qp = s->current_picture.qscale_table[mb_xy];
        // the left/top averages are only tested when that neighbour exists (mb_x/mb_y != 0)
6466         if(qp <= qp_thresh
6467            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6468            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6469             return;
6470         }
6471     }
6472
6473     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6474     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6475         int top_type, left_type[2];
6476         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6477         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6478         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6479
        // neighbours coded with 8x8 DCT: replace their cache entries (top row,
        // left column) by the matching cbp_table bit, two cache entries per bit
6480         if(IS_8x8DCT(top_type)){
6481             h->non_zero_count_cache[4+8*0]=
6482             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6483             h->non_zero_count_cache[6+8*0]=
6484             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6485         }
6486         if(IS_8x8DCT(left_type[0])){
6487             h->non_zero_count_cache[3+8*1]=
6488             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6489         }
6490         if(IS_8x8DCT(left_type[1])){
6491             h->non_zero_count_cache[3+8*3]=
6492             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6493         }
6494
        // current macroblock coded with 8x8 DCT: each group of four scan8
        // entries takes one bit of h->cbp (bits 1, 2, 4, 8 in turn)
6495         if(IS_8x8DCT(mb_type)){
6496             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6497             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6498
6499             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6500             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6501
6502             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6503             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6504
6505             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6506             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6507         }
6508     }
6509
6510     if (FRAME_MBAFF
6511             // left mb is in picture
6512             && h->slice_table[mb_xy-1] != 0xFFFF
6513             // and current and left pair do not have the same interlaced type
6514             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6515             // and left mb is in the same slice if deblocking_filter == 2
6516             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6517         /* First vertical edge is different in MBAFF frames
6518          * There are 8 different bS to compute and 2 different Qp
6519          */
        // index of the top macroblock of the current pair (mb_y rounded down to even)
6520         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
        // the two macroblocks of the pair to the left: top one, then the one a row below
6521         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6522         int16_t bS[8];
6523         int qp[2];
6524         int bqp[2];
6525         int rqp[2];
6526         int mb_qp, mbn0_qp, mbn1_qp;
6527         int i;
6528         first_vertical_edge_done = 1;
6529
        // an intra current macroblock forces bS 4 on all eight entries
6530         if( IS_INTRA(mb_type) )
6531             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6532         else {
6533             for( i = 0; i < 8; i++ ) {
                // left neighbour for entry i: chosen by i>>2 when MB_FIELD is set, by i&1 otherwise
6534                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6535
                // bS 4 for an intra neighbour; bS 2 when the cache entry at
                // 12+8*(i>>1) (presumably the current MB's left column -- confirm)
                // is non-zero or the neighbour reports coefficients (via cbp_table
                // for CAVLC 8x8 DCT neighbours, via non_zero_count otherwise); bS 1 otherwise
6536                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6537                     bS[i] = 4;
6538                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6539                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6540                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6541                                                                        :
6542                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6543                     bS[i] = 2;
6544                 else
6545                     bS[i] = 1;
6546             }
6547         }
6548
        // rounded average of the current QP with each of the two left QPs,
        // for luma (qp) and, through get_chroma_qp(), for Cb (bqp) and Cr (rqp)
6549         mb_qp = s->current_picture.qscale_table[mb_xy];
6550         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6551         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6552         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6553         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6554                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6555         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6556                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6557         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6558         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6559                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6560         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6561                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6562
6563         /* Filter edge */
6564         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6565         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6566         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6567         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6568         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6569     }
6570
    // Both preprocessor variants make the same two calls: direction 0 receives
    // first_vertical_edge_done, direction 1 always receives 0.
6571 #ifdef CONFIG_SMALL
6572     for( dir = 0; dir < 2; dir++ )
6573         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6574 #else
6575     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6576     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6577 #endif
6578 }
6579
/**
 * Decode the macroblocks of one slice.
 *
 * Depending on h->pps.cabac the macroblocks are parsed with decode_mb_cabac()
 * or decode_mb_cavlc(); every successfully parsed macroblock is reconstructed
 * with hl_decode_mb(). The decoded range is reported through ff_er_add_slice()
 * on both the success and the error paths.
 *
 * @param avctx codec context (not referenced in the body; s->avctx is used instead)
 * @param arg   pointer to a pointer to the H264Context to decode with
 * @return 0 when the end of the slice is reached, -1 on a macroblock decoding
 *         error or when the CAVLC bit position does not match the slice size
 */
6580 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6581     H264Context *h = *(void**)arg;
6582     MpegEncContext * const s = &h->s;
    // mask applied to every status passed to ff_er_add_slice(): only the AC
    // flags for partitioned frames, 0x7F otherwise
6583     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6584
6585     s->mb_skip_run= -1;
6586
    // is_complex is set for MBAFF, non-frame pictures, non-H.264 codec ids,
    // gray decoding (if enabled) and encoding (if enabled)
6587     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6588                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6589
6590     if( h->pps.cabac ) {
6591         int i;
6592
6593         /* realign */
6594         align_get_bits( &s->gb );
6595
6596         /* init cabac */
        // the CABAC decoder starts at the current byte position and is given the remaining byte count
6597         ff_init_cabac_states( &h->cabac);
6598         ff_init_cabac_decoder( &h->cabac,
6599                                s->gb.buffer + get_bits_count(&s->gb)/8,
6600                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6601         /* calculate pre-state */
        // each of the 460 contexts: (m * qscale >> 4) + n clipped to [1,126], with
        // (m, n) from the I table or from the P/B table selected by cabac_init_idc
6602         for( i= 0; i < 460; i++ ) {
6603             int pre;
6604             if( h->slice_type_nos == FF_I_TYPE )
6605                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6606             else
6607                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6608
            // stored state is twice the distance from the 63/64 midpoint, with the
            // low bit set when pre is above 63
6609             if( pre <= 63 )
6610                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6611             else
6612                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6613         }
6614
6615         for(;;){
6616 //START_TIMER
6617             int ret = decode_mb_cabac(h);
6618             int eos;
6619 //STOP_TIMER("decode_mb_cabac")
6620
6621             if(ret>=0) hl_decode_mb(h);
6622
            // MBAFF: decode the second macroblock of the pair one row down, then restore mb_y
6623             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6624                 s->mb_y++;
6625
6626                 ret = decode_mb_cabac(h);
6627
6628                 if(ret>=0) hl_decode_mb(h);
6629                 s->mb_y--;
6630             }
6631             eos = get_cabac_terminate( &h->cabac );
6632
            // fail on a macroblock error or when the CABAC reader is more than 2 bytes past its end
6633             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6634                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6635                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6636                 return -1;
6637             }
6638
            // end of row: emit the finished band and advance mb_y (by two rows for field/MBAFF pictures)
6639             if( ++s->mb_x >= s->mb_width ) {
6640                 s->mb_x = 0;
6641                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6642                 ++s->mb_y;
6643                 if(FIELD_OR_MBAFF_PICTURE) {
6644                     ++s->mb_y;
6645                 }
6646             }
6647
            // the slice ends on the CABAC terminate bin or after the last macroblock row
6648             if( eos || s->mb_y >= s->mb_height ) {
6649                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6650                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6651                 return 0;
6652             }
6653         }
6654
6655     } else {
6656         for(;;){
6657             int ret = decode_mb_cavlc(h);
6658
6659             if(ret>=0) hl_decode_mb(h);
6660
            // MBAFF: decode the second macroblock of the pair one row down, then restore mb_y
6661             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6662                 s->mb_y++;
6663                 ret = decode_mb_cavlc(h);
6664
6665                 if(ret>=0) hl_decode_mb(h);
6666                 s->mb_y--;
6667             }
6668
6669             if(ret<0){
6670                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6671                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6672
6673                 return -1;
6674             }
6675
            // end of row: emit the finished band and advance mb_y (by two rows for field/MBAFF pictures)
6676             if(++s->mb_x >= s->mb_width){
6677                 s->mb_x=0;
6678                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6679                 ++s->mb_y;
6680                 if(FIELD_OR_MBAFF_PICTURE) {
6681                     ++s->mb_y;
6682                 }
                // past the last row: success only if exactly all bits were consumed
6683                 if(s->mb_y >= s->mb_height){
6684                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6685
6686                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6687                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6688
6689                         return 0;
6690                     }else{
6691                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6692
6693                         return -1;
6694                     }
6695                 }
6696             }
6697
            // bitstream exhausted with no pending skip run: a clean end needs an
            // exact bit count, an overrun is reported as an error
6698             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6699                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6700                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6701                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6702
6703                     return 0;
6704                 }else{
6705                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6706
6707                     return -1;
6708                 }
6709             }
6710         }
6711     }
6712
    // The section below is excluded from compilation by "#if 0"; it is a
    // disabled nested-loop variant of the decoding loops above.
6713 #if 0
6714     for(;s->mb_y < s->mb_height; s->mb_y++){
6715         for(;s->mb_x < s->mb_width; s->mb_x++){
6716             int ret= decode_mb(h);
6717
6718             hl_decode_mb(h);
6719
6720             if(ret<0){
6721                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6722                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6723
6724                 return -1;
6725             }
6726
6727             if(++s->mb_x >= s->mb_width){
6728                 s->mb_x=0;
6729                 if(++s->mb_y >= s->mb_height){
6730                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6731                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6732
6733                         return 0;
6734                     }else{
6735                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6736
6737                         return -1;
6738                     }
6739                 }
6740             }
6741
6742             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6743                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6744                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6745
6746                     return 0;
6747                 }else{
6748                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6749
6750                     return -1;
6751                 }
6752             }
6753         }
6754         s->mb_x=0;
6755         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6756     }
6757 #endif
6758     return -1; //not reached
6759 }
6760
6761 static int decode_picture_timing(H264Context *h){
6762     MpegEncContext * const s = &h->s;
6763     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6764         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6765         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6766     }
6767     if(h->sps.pic_struct_present_flag){
6768         unsigned int i, num_clock_ts;
6769         h->sei_pic_struct = get_bits(&s->gb, 4);
6770
6771         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6772             return -1;
6773
6774         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6775
6776         for (i = 0 ; i < num_clock_ts ; i++){
6777             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6778                 unsigned int full_timestamp_flag;
6779                 skip_bits(&s->gb, 2);                 /* ct_type */
6780                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6781                 skip_bits(&s->gb, 5);                 /* counting_type */
6782                 full_timestamp_flag = get_bits(&s->gb, 1);
6783                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6784                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6785                 skip_bits(&s->gb, 8);                 /* n_frames */
6786                 if(full_timestamp_flag){
6787                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6788                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6789                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6790                 }else{
6791                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6792                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6793                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6794                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6795                             if(get_bits(&s->gb, 1))   /* hours_flag */
6796                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6797                         }
6798                     }
6799                 }
6800                 if(h->sps.time_offset_length > 0)
6801                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6802             }
6803         }
6804     }
6805     return 0;
6806 }
6807
6808 static int decode_unregistered_user_data(H264Context *h, int size){
6809     MpegEncContext * const s = &h->s;
6810     uint8_t user_data[16+256];
6811     int e, build, i;
6812
6813     if(size<16)
6814         return -1;
6815
6816     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6817         user_data[i]= get_bits(&s->gb, 8);
6818     }
6819
6820     user_data[i]= 0;
6821     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6822     if(e==1 && build>=0)
6823         h->x264_build= build;
6824
6825     if(s->avctx->debug & FF_DEBUG_BUGS)
6826         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6827
6828     for(; i<size; i++)
6829         skip_bits(&s->gb, 8);
6830
6831     return 0;
6832 }
6833
6834 static int decode_sei(H264Context *h){
6835     MpegEncContext * const s = &h->s;
6836
6837     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6838         int size, type;
6839
6840         type=0;
6841         do{
6842             type+= show_bits(&s->gb, 8);
6843         }while(get_bits(&s->gb, 8) == 255);
6844
6845         size=0;
6846         do{
6847             size+= show_bits(&s->gb, 8);
6848         }while(get_bits(&s->gb, 8) == 255);
6849
6850         switch(type){
6851         case 1: // Picture timing SEI
6852             if(decode_picture_timing(h) < 0)
6853                 return -1;
6854             break;
6855         case 5:
6856             if(decode_unregistered_user_data(h, size) < 0)
6857                 return -1;
6858             break;
6859         default:
6860             skip_bits(&s->gb, 8*size);
6861         }
6862
6863         //FIXME check bits here
6864         align_get_bits(&s->gb);
6865     }
6866
6867     return 0;
6868 }
6869
6870 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6871     MpegEncContext * const s = &h->s;
6872     int cpb_count, i;
6873     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6874
6875     if(cpb_count > 32U){
6876         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6877         return -1;
6878     }
6879
6880     get_bits(&s->gb, 4); /* bit_rate_scale */
6881     get_bits(&s->gb, 4); /* cpb_size_scale */
6882     for(i=0; i<cpb_count; i++){
6883         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6884         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6885         get_bits1(&s->gb);     /* cbr_flag */
6886     }
6887     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6888     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6889     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6890     sps->time_offset_length = get_bits(&s->gb, 5);
6891     return 0;
6892 }
6893
6894 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6895     MpegEncContext * const s = &h->s;
6896     int aspect_ratio_info_present_flag;
6897     unsigned int aspect_ratio_idc;
6898
6899     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6900
6901     if( aspect_ratio_info_present_flag ) {
6902         aspect_ratio_idc= get_bits(&s->gb, 8);
6903         if( aspect_ratio_idc == EXTENDED_SAR ) {
6904             sps->sar.num= get_bits(&s->gb, 16);
6905             sps->sar.den= get_bits(&s->gb, 16);
6906         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6907             sps->sar=  pixel_aspect[aspect_ratio_idc];
6908         }else{
6909             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6910             return -1;
6911         }
6912     }else{
6913         sps->sar.num=
6914         sps->sar.den= 0;
6915     }
6916 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6917
6918     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6919         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6920     }
6921
6922     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6923         get_bits(&s->gb, 3);    /* video_format */
6924         get_bits1(&s->gb);      /* video_full_range_flag */
6925         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6926             get_bits(&s->gb, 8); /* colour_primaries */
6927             get_bits(&s->gb, 8); /* transfer_characteristics */
6928             get_bits(&s->gb, 8); /* matrix_coefficients */
6929         }
6930     }
6931
6932     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6933         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6934         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6935     }
6936
6937     sps->timing_info_present_flag = get_bits1(&s->gb);
6938     if(sps->timing_info_present_flag){
6939         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6940         sps->time_scale = get_bits_long(&s->gb, 32);
6941         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6942     }
6943
6944     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6945     if(sps->nal_hrd_parameters_present_flag)
6946         if(decode_hrd_parameters(h, sps) < 0)
6947             return -1;
6948     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6949     if(sps->vcl_hrd_parameters_present_flag)
6950         if(decode_hrd_parameters(h, sps) < 0)
6951             return -1;
6952     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6953         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6954     sps->pic_struct_present_flag = get_bits1(&s->gb);
6955
6956     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6957     if(sps->bitstream_restriction_flag){
6958         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6959         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6960         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6961         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6962         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6963         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6964         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6965
6966         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6967             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6968             return -1;
6969         }
6970     }
6971
6972     return 0;
6973 }
6974
6975 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6976                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6977     MpegEncContext * const s = &h->s;
6978     int i, last = 8, next = 8;
6979     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6980     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6981         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6982     else
6983     for(i=0;i<size;i++){
6984         if(next)
6985             next = (last + get_se_golomb(&s->gb)) & 0xff;
6986         if(!i && !next){ /* matrix not written, we use the preset one */
6987             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6988             break;
6989         }
6990         last = factors[scan[i]] = next ? next : last;
6991     }
6992 }
6993
6994 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6995                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6996     MpegEncContext * const s = &h->s;
6997     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6998     const uint8_t *fallback[4] = {
6999         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7000         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7001         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7002         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7003     };
7004     if(get_bits1(&s->gb)){
7005         sps->scaling_matrix_present |= is_sps;
7006         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7007         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7008         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7009         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7010         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7011         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7012         if(is_sps || pps->transform_8x8_mode){
7013             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7014             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7015         }
7016     }
7017 }
7018
7019 static inline int decode_seq_parameter_set(H264Context *h){
7020     MpegEncContext * const s = &h->s;
7021     int profile_idc, level_idc;
7022     unsigned int sps_id;
7023     int i;
7024     SPS *sps;
7025
7026     profile_idc= get_bits(&s->gb, 8);
7027     get_bits1(&s->gb);   //constraint_set0_flag
7028     get_bits1(&s->gb);   //constraint_set1_flag
7029     get_bits1(&s->gb);   //constraint_set2_flag
7030     get_bits1(&s->gb);   //constraint_set3_flag
7031     get_bits(&s->gb, 4); // reserved
7032     level_idc= get_bits(&s->gb, 8);
7033     sps_id= get_ue_golomb_31(&s->gb);
7034
7035     if(sps_id >= MAX_SPS_COUNT) {
7036         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7037         return -1;
7038     }
7039     sps= av_mallocz(sizeof(SPS));
7040     if(sps == NULL)
7041         return -1;
7042
7043     sps->profile_idc= profile_idc;
7044     sps->level_idc= level_idc;
7045
7046     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7047     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7048     sps->scaling_matrix_present = 0;
7049
7050     if(sps->profile_idc >= 100){ //high profile
7051         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7052         if(sps->chroma_format_idc == 3)
7053             get_bits1(&s->gb);  //residual_color_transform_flag
7054         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7055         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7056         sps->transform_bypass = get_bits1(&s->gb);
7057         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7058     }else{
7059         sps->chroma_format_idc= 1;
7060     }
7061
7062     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7063     sps->poc_type= get_ue_golomb_31(&s->gb);
7064
7065     if(sps->poc_type == 0){ //FIXME #define
7066         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7067     } else if(sps->poc_type == 1){//FIXME #define
7068         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7069         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7070         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7071         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7072
7073         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7074             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7075             goto fail;
7076         }
7077
7078         for(i=0; i<sps->poc_cycle_length; i++)
7079             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7080     }else if(sps->poc_type != 2){
7081         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7082         goto fail;
7083     }
7084
7085     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7086     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7087         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7088         goto fail;
7089     }
7090     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7091     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7092     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7093     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7094        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7095         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7096         goto fail;
7097     }
7098
7099     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7100     if(!sps->frame_mbs_only_flag)
7101         sps->mb_aff= get_bits1(&s->gb);
7102     else
7103         sps->mb_aff= 0;
7104
7105     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7106
7107 #ifndef ALLOW_INTERLACE
7108     if(sps->mb_aff)
7109         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7110 #endif
7111     sps->crop= get_bits1(&s->gb);
7112     if(sps->crop){
7113         sps->crop_left  = get_ue_golomb(&s->gb);
7114         sps->crop_right = get_ue_golomb(&s->gb);
7115         sps->crop_top   = get_ue_golomb(&s->gb);
7116         sps->crop_bottom= get_ue_golomb(&s->gb);
7117         if(sps->crop_left || sps->crop_top){
7118             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7119         }
7120         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7121             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7122         }
7123     }else{
7124         sps->crop_left  =
7125         sps->crop_right =
7126         sps->crop_top   =
7127         sps->crop_bottom= 0;
7128     }
7129
7130     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7131     if( sps->vui_parameters_present_flag )
7132         decode_vui_parameters(h, sps);
7133
7134     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7135         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7136                sps_id, sps->profile_idc, sps->level_idc,
7137                sps->poc_type,
7138                sps->ref_frame_count,
7139                sps->mb_width, sps->mb_height,
7140                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7141                sps->direct_8x8_inference_flag ? "8B8" : "",
7142                sps->crop_left, sps->crop_right,
7143                sps->crop_top, sps->crop_bottom,
7144                sps->vui_parameters_present_flag ? "VUI" : "",
7145                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7146                );
7147     }
7148     av_free(h->sps_buffers[sps_id]);
7149     h->sps_buffers[sps_id]= sps;
7150     return 0;
7151 fail:
7152     av_free(sps);
7153     return -1;
7154 }
7155
7156 static void
7157 build_qp_table(PPS *pps, int t, int index)
7158 {
7159     int i;
7160     for(i = 0; i < 52; i++)
7161         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7162 }
7163
7164 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7165     MpegEncContext * const s = &h->s;
7166     unsigned int pps_id= get_ue_golomb(&s->gb);
7167     PPS *pps;
7168
7169     if(pps_id >= MAX_PPS_COUNT) {
7170         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7171         return -1;
7172     }
7173
7174     pps= av_mallocz(sizeof(PPS));
7175     if(pps == NULL)
7176         return -1;
7177     pps->sps_id= get_ue_golomb_31(&s->gb);
7178     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7179         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7180         goto fail;
7181     }
7182
7183     pps->cabac= get_bits1(&s->gb);
7184     pps->pic_order_present= get_bits1(&s->gb);
7185     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7186     if(pps->slice_group_count > 1 ){
7187         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7188         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7189         switch(pps->mb_slice_group_map_type){
7190         case 0:
7191 #if 0
7192 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7193 |    run_length[ i ]                                |1  |ue(v)   |
7194 #endif
7195             break;
7196         case 2:
7197 #if 0
7198 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7199 |{                                                  |   |        |
7200 |    top_left_mb[ i ]                               |1  |ue(v)   |
7201 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7202 |   }                                               |   |        |
7203 #endif
7204             break;
7205         case 3:
7206         case 4:
7207         case 5:
7208 #if 0
7209 |   slice_group_change_direction_flag               |1  |u(1)    |
7210 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7211 #endif
7212             break;
7213         case 6:
7214 #if 0
7215 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7216 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7217 |)                                                  |   |        |
7218 |    slice_group_id[ i ]                            |1  |u(v)    |
7219 #endif
7220             break;
7221         }
7222     }
7223     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7224     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7225     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7226         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7227         goto fail;
7228     }
7229
7230     pps->weighted_pred= get_bits1(&s->gb);
7231     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7232     pps->init_qp= get_se_golomb(&s->gb) + 26;
7233     pps->init_qs= get_se_golomb(&s->gb) + 26;
7234     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7235     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7236     pps->constrained_intra_pred= get_bits1(&s->gb);
7237     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7238
7239     pps->transform_8x8_mode= 0;
7240     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7241     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7242     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7243
7244     if(get_bits_count(&s->gb) < bit_length){
7245         pps->transform_8x8_mode= get_bits1(&s->gb);
7246         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7247         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7248     } else {
7249         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7250     }
7251
7252     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7253     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7254     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7255         h->pps.chroma_qp_diff= 1;
7256
7257     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7258         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7259                pps_id, pps->sps_id,
7260                pps->cabac ? "CABAC" : "CAVLC",
7261                pps->slice_group_count,
7262                pps->ref_count[0], pps->ref_count[1],
7263                pps->weighted_pred ? "weighted" : "",
7264                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7265                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7266                pps->constrained_intra_pred ? "CONSTR" : "",
7267                pps->redundant_pic_cnt_present ? "REDU" : "",
7268                pps->transform_8x8_mode ? "8x8DCT" : ""
7269                );
7270     }
7271
7272     av_free(h->pps_buffers[pps_id]);
7273     h->pps_buffers[pps_id]= pps;
7274     return 0;
7275 fail:
7276     av_free(pps);
7277     return -1;
7278 }
7279
7280 /**
7281  * Call decode_slice() for each context.
7282  *
7283  * @param h h264 master context
7284  * @param context_count number of contexts to execute
7285  */
7286 static void execute_decode_slices(H264Context *h, int context_count){
7287     MpegEncContext * const s = &h->s;
7288     AVCodecContext * const avctx= s->avctx;
7289     H264Context *hx;
7290     int i;
7291
7292     if(context_count == 1) {
7293         decode_slice(avctx, &h);
7294     } else {
7295         for(i = 1; i < context_count; i++) {
7296             hx = h->thread_context[i];
7297             hx->s.error_recognition = avctx->error_recognition;
7298             hx->s.error_count = 0;
7299         }
7300
7301         avctx->execute(avctx, (void *)decode_slice,
7302                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7303
7304         /* pull back stuff from slices to master context */
7305         hx = h->thread_context[context_count - 1];
7306         s->mb_x = hx->s.mb_x;
7307         s->mb_y = hx->s.mb_y;
7308         s->dropable = hx->s.dropable;
7309         s->picture_structure = hx->s.picture_structure;
7310         for(i = 1; i < context_count; i++)
7311             h->s.error_count += h->thread_context[i]->s.error_count;
7312     }
7313 }
7314
7315
7316 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7317     MpegEncContext * const s = &h->s;
7318     AVCodecContext * const avctx= s->avctx;
7319     int buf_index=0;
7320     H264Context *hx; ///< thread context
7321     int context_count = 0;
7322
7323     h->max_contexts = avctx->thread_count;
7324 #if 0
7325     int i;
7326     for(i=0; i<50; i++){
7327         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7328     }
7329 #endif
7330     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7331         h->current_slice = 0;
7332         if (!s->first_field)
7333             s->current_picture_ptr= NULL;
7334     }
7335
7336     for(;;){
7337         int consumed;
7338         int dst_length;
7339         int bit_length;
7340         const uint8_t *ptr;
7341         int i, nalsize = 0;
7342         int err;
7343
7344         if(h->is_avc) {
7345             if(buf_index >= buf_size) break;
7346             nalsize = 0;
7347             for(i = 0; i < h->nal_length_size; i++)
7348                 nalsize = (nalsize << 8) | buf[buf_index++];
7349             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7350                 if(nalsize == 1){
7351                     buf_index++;
7352                     continue;
7353                 }else{
7354                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7355                     break;
7356                 }
7357             }
7358         } else {
7359             // start code prefix search
7360             for(; buf_index + 3 < buf_size; buf_index++){
7361                 // This should always succeed in the first iteration.
7362                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7363                     break;
7364             }
7365
7366             if(buf_index+3 >= buf_size) break;
7367
7368             buf_index+=3;
7369         }
7370
7371         hx = h->thread_context[context_count];
7372
7373         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7374         if (ptr==NULL || dst_length < 0){
7375             return -1;
7376         }
7377         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7378             dst_length--;
7379         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7380
7381         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7382             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7383         }
7384
7385         if (h->is_avc && (nalsize != consumed)){
7386             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7387             consumed= nalsize;
7388         }
7389
7390         buf_index += consumed;
7391
7392         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7393            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7394             continue;
7395
7396       again:
7397         err = 0;
7398         switch(hx->nal_unit_type){
7399         case NAL_IDR_SLICE:
7400             if (h->nal_unit_type != NAL_IDR_SLICE) {
7401                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7402                 return -1;
7403             }
7404             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7405         case NAL_SLICE:
7406             init_get_bits(&hx->s.gb, ptr, bit_length);
7407             hx->intra_gb_ptr=
7408             hx->inter_gb_ptr= &hx->s.gb;
7409             hx->s.data_partitioning = 0;
7410
7411             if((err = decode_slice_header(hx, h)))
7412                break;
7413
7414             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7415             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7416                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7417                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7418                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7419                && avctx->skip_frame < AVDISCARD_ALL)
7420                 context_count++;
7421             break;
7422         case NAL_DPA:
7423             init_get_bits(&hx->s.gb, ptr, bit_length);
7424             hx->intra_gb_ptr=
7425             hx->inter_gb_ptr= NULL;
7426             hx->s.data_partitioning = 1;
7427
7428             err = decode_slice_header(hx, h);
7429             break;
7430         case NAL_DPB:
7431             init_get_bits(&hx->intra_gb, ptr, bit_length);
7432             hx->intra_gb_ptr= &hx->intra_gb;
7433             break;
7434         case NAL_DPC:
7435             init_get_bits(&hx->inter_gb, ptr, bit_length);
7436             hx->inter_gb_ptr= &hx->inter_gb;
7437
7438             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7439                && s->context_initialized
7440                && s->hurry_up < 5
7441                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7442                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7443                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7444                && avctx->skip_frame < AVDISCARD_ALL)
7445                 context_count++;
7446             break;
7447         case NAL_SEI:
7448             init_get_bits(&s->gb, ptr, bit_length);
7449             decode_sei(h);
7450             break;
7451         case NAL_SPS:
7452             init_get_bits(&s->gb, ptr, bit_length);
7453             decode_seq_parameter_set(h);
7454
7455             if(s->flags& CODEC_FLAG_LOW_DELAY)
7456                 s->low_delay=1;
7457
7458             if(avctx->has_b_frames < 2)
7459                 avctx->has_b_frames= !s->low_delay;
7460             break;
7461         case NAL_PPS:
7462             init_get_bits(&s->gb, ptr, bit_length);
7463
7464             decode_picture_parameter_set(h, bit_length);
7465
7466             break;
7467         case NAL_AUD:
7468         case NAL_END_SEQUENCE:
7469         case NAL_END_STREAM:
7470         case NAL_FILLER_DATA:
7471         case NAL_SPS_EXT:
7472         case NAL_AUXILIARY_SLICE:
7473             break;
7474         default:
7475             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7476         }
7477
7478         if(context_count == h->max_contexts) {
7479             execute_decode_slices(h, context_count);
7480             context_count = 0;
7481         }
7482
7483         if (err < 0)
7484             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7485         else if(err == 1) {
7486             /* Slice could not be decoded in parallel mode, copy down
7487              * NAL unit stuff to context 0 and restart. Note that
7488              * rbsp_buffer is not transferred, but since we no longer
7489              * run in parallel mode this should not be an issue. */
7490             h->nal_unit_type = hx->nal_unit_type;
7491             h->nal_ref_idc   = hx->nal_ref_idc;
7492             hx = h;
7493             goto again;
7494         }
7495     }
7496     if(context_count)
7497         execute_decode_slices(h, context_count);
7498     return buf_index;
7499 }
7500
7501 /**
7502  * returns the number of bytes consumed for building the current frame
7503  */
7504 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7505         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7506         if(pos+10>buf_size) pos=buf_size; // oops ;)
7507
7508         return pos;
7509 }
7510
7511 static int decode_frame(AVCodecContext *avctx,
7512                              void *data, int *data_size,
7513                              const uint8_t *buf, int buf_size)
7514 {
7515     H264Context *h = avctx->priv_data;
7516     MpegEncContext *s = &h->s;
7517     AVFrame *pict = data;
7518     int buf_index;
7519
7520     s->flags= avctx->flags;
7521     s->flags2= avctx->flags2;
7522
7523    /* end of stream, output what is still in the buffers */
7524     if (buf_size == 0) {
7525         Picture *out;
7526         int i, out_idx;
7527
7528 //FIXME factorize this with the output code below
7529         out = h->delayed_pic[0];
7530         out_idx = 0;
7531         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7532             if(h->delayed_pic[i]->poc < out->poc){
7533                 out = h->delayed_pic[i];
7534                 out_idx = i;
7535             }
7536
7537         for(i=out_idx; h->delayed_pic[i]; i++)
7538             h->delayed_pic[i] = h->delayed_pic[i+1];
7539
7540         if(out){
7541             *data_size = sizeof(AVFrame);
7542             *pict= *(AVFrame*)out;
7543         }
7544
7545         return 0;
7546     }
7547
7548     if(h->is_avc && !h->got_avcC) {
7549         int i, cnt, nalsize;
7550         unsigned char *p = avctx->extradata;
7551         if(avctx->extradata_size < 7) {
7552             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7553             return -1;
7554         }
7555         if(*p != 1) {
7556             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7557             return -1;
7558         }
7559         /* sps and pps in the avcC always have length coded with 2 bytes,
7560            so put a fake nal_length_size = 2 while parsing them */
7561         h->nal_length_size = 2;
7562         // Decode sps from avcC
7563         cnt = *(p+5) & 0x1f; // Number of sps
7564         p += 6;
7565         for (i = 0; i < cnt; i++) {
7566             nalsize = AV_RB16(p) + 2;
7567             if(decode_nal_units(h, p, nalsize) < 0) {
7568                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7569                 return -1;
7570             }
7571             p += nalsize;
7572         }
7573         // Decode pps from avcC
7574         cnt = *(p++); // Number of pps
7575         for (i = 0; i < cnt; i++) {
7576             nalsize = AV_RB16(p) + 2;
7577             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7578                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7579                 return -1;
7580             }
7581             p += nalsize;
7582         }
7583         // Now store right nal length size, that will be use to parse all other nals
7584         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7585         // Do not reparse avcC
7586         h->got_avcC = 1;
7587     }
7588
7589     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7590         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7591             return -1;
7592         h->got_avcC = 1;
7593     }
7594
7595     buf_index=decode_nal_units(h, buf, buf_size);
7596     if(buf_index < 0)
7597         return -1;
7598
7599     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7600         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7601         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7602         return -1;
7603     }
7604
7605     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7606         Picture *out = s->current_picture_ptr;
7607         Picture *cur = s->current_picture_ptr;
7608         int i, pics, cross_idr, out_of_order, out_idx;
7609
7610         s->mb_y= 0;
7611
7612         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7613         s->current_picture_ptr->pict_type= s->pict_type;
7614
7615         if(!s->dropable) {
7616             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7617             h->prev_poc_msb= h->poc_msb;
7618             h->prev_poc_lsb= h->poc_lsb;
7619         }
7620         h->prev_frame_num_offset= h->frame_num_offset;
7621         h->prev_frame_num= h->frame_num;
7622
7623         /*
7624          * FIXME: Error handling code does not seem to support interlaced
7625          * when slices span multiple rows
7626          * The ff_er_add_slice calls don't work right for bottom
7627          * fields; they cause massive erroneous error concealing
7628          * Error marking covers both fields (top and bottom).
7629          * This causes a mismatched s->error_count
7630          * and a bad error table. Further, the error count goes to
7631          * INT_MAX when called for bottom field, because mb_y is
7632          * past end by one (callers fault) and resync_mb_y != 0
7633          * causes problems for the first MB line, too.
7634          */
7635         if (!FIELD_PICTURE)
7636             ff_er_frame_end(s);
7637
7638         MPV_frame_end(s);
7639
7640         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7641             /* Wait for second field. */
7642             *data_size = 0;
7643
7644         } else {
7645             cur->repeat_pict = 0;
7646
7647             /* Signal interlacing information externally. */
7648             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7649             if(h->sps.pic_struct_present_flag){
7650                 switch (h->sei_pic_struct)
7651                 {
7652                 case SEI_PIC_STRUCT_FRAME:
7653                     cur->interlaced_frame = 0;
7654                     break;
7655                 case SEI_PIC_STRUCT_TOP_FIELD:
7656                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7657                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7658                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7659                     cur->interlaced_frame = 1;
7660                     break;
7661                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7662                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7663                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7664                     // From these hints, let the applications decide if they apply deinterlacing.
7665                     cur->repeat_pict = 1;
7666                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7667                     break;
7668                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7669                     // Force progressive here, as doubling interlaced frame is a bad idea.
7670                     cur->interlaced_frame = 0;
7671                     cur->repeat_pict = 2;
7672                     break;
7673                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7674                     cur->interlaced_frame = 0;
7675                     cur->repeat_pict = 4;
7676                     break;
7677                 }
7678             }else{
7679                 /* Derive interlacing flag from used decoding process. */
7680                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7681             }
7682
7683             if (cur->field_poc[0] != cur->field_poc[1]){
7684                 /* Derive top_field_first from field pocs. */
7685                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7686             }else{
7687                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7688                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7689                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7690                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7691                         cur->top_field_first = 1;
7692                     else
7693                         cur->top_field_first = 0;
7694                 }else{
7695                     /* Most likely progressive */
7696                     cur->top_field_first = 0;
7697                 }
7698             }
7699
7700         //FIXME do something with unavailable reference frames
7701
7702             /* Sort B-frames into display order */
7703
7704             if(h->sps.bitstream_restriction_flag
7705                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7706                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7707                 s->low_delay = 0;
7708             }
7709
7710             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7711                && !h->sps.bitstream_restriction_flag){
7712                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7713                 s->low_delay= 0;
7714             }
7715
7716             pics = 0;
7717             while(h->delayed_pic[pics]) pics++;
7718
7719             assert(pics <= MAX_DELAYED_PIC_COUNT);
7720
7721             h->delayed_pic[pics++] = cur;
7722             if(cur->reference == 0)
7723                 cur->reference = DELAYED_PIC_REF;
7724
7725             out = h->delayed_pic[0];
7726             out_idx = 0;
7727             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7728                 if(h->delayed_pic[i]->poc < out->poc){
7729                     out = h->delayed_pic[i];
7730                     out_idx = i;
7731                 }
7732             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7733
7734             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7735
7736             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7737                 { }
7738             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7739                || (s->low_delay &&
7740                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7741                  || cur->pict_type == FF_B_TYPE)))
7742             {
7743                 s->low_delay = 0;
7744                 s->avctx->has_b_frames++;
7745             }
7746
7747             if(out_of_order || pics > s->avctx->has_b_frames){
7748                 out->reference &= ~DELAYED_PIC_REF;
7749                 for(i=out_idx; h->delayed_pic[i]; i++)
7750                     h->delayed_pic[i] = h->delayed_pic[i+1];
7751             }
7752             if(!out_of_order && pics > s->avctx->has_b_frames){
7753                 *data_size = sizeof(AVFrame);
7754
7755                 h->outputed_poc = out->poc;
7756                 *pict= *(AVFrame*)out;
7757             }else{
7758                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7759             }
7760         }
7761     }
7762
7763     assert(pict->data[0] || !*data_size);
7764     ff_print_debug_info(s, pict);
7765 //printf("out %d\n", (int)pict->data[0]);
7766 #if 0 //?
7767
7768     /* Return the Picture timestamp as the frame number */
7769     /* we subtract 1 because it is added on utils.c     */
7770     avctx->frame_number = s->picture_number - 1;
7771 #endif
7772     return get_consumed_bytes(s, buf_index, buf_size);
7773 }
#if 0
/**
 * Set h->mb_avail[] for the current macroblock (disabled code).
 *
 * Entries 0..2 are derived from the slice table entries of the row above
 * (left, centre, right) and entry 3 from the entry to the left; an entry is
 * set when that position belongs to the current slice number.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[2]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[0]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7793
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self-test entry point, only built when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * scratch buffer, reads them back and prints every value that does not
 * survive the round trip. A mismatch is only reported, it does not stop
 * the run. The 4x4 (I)DCT, quantizer and NAL tests further down are
 * compiled out with "#if 0".
 *
 * @return 0 (the only non-zero returns sit in the disabled section)
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    AVCodecContext avctx;

    /* dsputil_init() is handed this context, so give it defined (zero)
     * contents instead of whatever happens to be on the stack */
    memset(&avctx, 0, sizeof(AVCodecContext));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j;
        unsigned s; /* printed with %X, which expects an unsigned value */

        /* peek at the upcoming bits so a failure can show the raw code */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* shift the range so that negative values are covered as well */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j;
        unsigned s; /* printed with %X, which expects an unsigned value */

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    /* Disabled tests, kept as they were. NOTE(review): this section uses
     * names (qp, src1_block, src2_block, s->dsp) that are not declared in
     * this function, so it will need work before it can be re-enabled. */
#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7968
7969
7970 static av_cold int decode_end(AVCodecContext *avctx)
7971 {
7972     H264Context *h = avctx->priv_data;
7973     MpegEncContext *s = &h->s;
7974     int i;
7975
7976     av_freep(&h->rbsp_buffer[0]);
7977     av_freep(&h->rbsp_buffer[1]);
7978     free_tables(h); //FIXME cleanup init stuff perhaps
7979
7980     for(i = 0; i < MAX_SPS_COUNT; i++)
7981         av_freep(h->sps_buffers + i);
7982
7983     for(i = 0; i < MAX_PPS_COUNT; i++)
7984         av_freep(h->pps_buffers + i);
7985
7986     MPV_common_end(s);
7987
7988 //    memset(h, 0, sizeof(H264Context));
7989
7990     return 0;
7991 }
7992
7993
7994 AVCodec h264_decoder = {
7995     "h264",
7996     CODEC_TYPE_VIDEO,
7997     CODEC_ID_H264,
7998     sizeof(H264Context),
7999     decode_init,
8000     NULL,
8001     decode_end,
8002     decode_frame,
8003     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8004     .flush= flush_dpb,
8005     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8006 };
8007
8008 #include "svq3.c"