libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "mathops.h"
  36 #include "rectangle.h"
  37 #include "vdpau_internal.h"
  38
  39 #include "cabac.h"
  40 #if ARCH_X86
  41 #include "x86/h264_i386.h"
  42 #endif
  43
  44 //#undef NDEBUG
  45 #include <assert.h>
  46
  47 /**
  48  * Value of Picture.reference when Picture is not a reference picture, but
  49  * is held for delayed output.
  50  */
  51 #define DELAYED_PIC_REF 4
  52
     /*
      * Static storage for the variable-length-code tables. Each VLC object (or
      * array of them) is declared next to a VLC_TYPE array and a size constant;
      * presumably the arrays are the backing store handed to the VLC objects
      * when the tables are built -- the initialisation code is not in this part
      * of the file, confirm there.
      */

     /* Four coeff_token tables sharing one flat array of 520+332+280+256 entries;
      * the per-table entry counts are listed in coeff_token_vlc_tables_size. */
  53 static VLC coeff_token_vlc[4];
  54 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  55 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  56
     /* Single coeff_token table for chroma DC, 256 entries. */
  57 static VLC chroma_dc_coeff_token_vlc;
  58 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  59 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  60
     /* Fifteen total_zeros tables, 512 entries each. */
  61 static VLC total_zeros_vlc[15];
  62 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  63 static const int total_zeros_vlc_tables_size = 512;
  64
     /* Three chroma DC total_zeros tables, 8 entries each. */
  65 static VLC chroma_dc_total_zeros_vlc[3];
  66 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  67 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  68
     /* Six run tables, 8 entries each. */
  69 static VLC run_vlc[6];
  70 static VLC_TYPE run_vlc_tables[6][8][2];
  71 static const int run_vlc_tables_size = 8;
  72
     /* One further run table with 96 entries. */
  73 static VLC run7_vlc;
  74 static VLC_TYPE run7_vlc_table[96][2];
  75 static const int run7_vlc_table_size = 96;
  76
     /* Forward declarations; the definitions are not in this part of the file. */
  77 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  78 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  79 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  80 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  82
  /**
   * Packs the low 16 bits of two values into one 32-bit word.
   *
   * The word is laid out so that, once stored to memory, a occupies the first
   * two bytes and b the following two; this is why the operands swap places
   * when WORDS_BIGENDIAN is defined.
   *
   * Arguments may be negative (fill_caches() passes signed int8_t reference
   * indices). Left-shifting a negative int is undefined behaviour in C, so the
   * shift is performed on an unsigned operand; the numeric result is the same
   * as the usual two's-complement outcome.
   *
   * @param a value whose low 16 bits go first in memory
   * @param b value whose low 16 bits go second in memory
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  90
  /* Lookup table: rem6[i] == i % 6 for i in 0..51. */
  91 static const uint8_t rem6[52]={
  92 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  93 };
  94
     /* Lookup table: div6[i] == i / 6 for i in 0..51. */
  95 static const uint8_t div6[52]={
  96 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  97 };
  98
     /*
      * Index sets used by fill_caches() to pick data out of the left neighbours.
      * Row 0 is the default. Rows 1..3 are chosen only under FRAME_MBAFF when the
      * field flag of the left pair differs from that of the current macroblock:
      * row 3 if the current macroblock is a field macroblock, otherwise row
      * 2 - bottom (row 2 for the top macroblock of the pair, row 1 for the bottom).
      * Columns 0..3 select 4x4 rows of the left neighbours (columns 0,1 for
      * left_xy[0], columns 2,3 for left_xy[1]); columns 4..7 are indices into the
      * neighbours' non_zero_count entries, again two per left_xy element.
      */
  99 static const int left_block_options[4][8]={
 100     {0,1,2,3,7,10,8,11},
 101     {2,2,3,3,8,11,8,11},
 102     {0,0,1,1,7,10,7,10},
 103     {0,2,0,2,7,10,7,10}
 104 };
 105
     /* Table of 7 x (1<<LEVEL_TAB_BITS) pairs of int8_t. It is neither filled nor
      * read in this part of the file; presumably a precomputed CAVLC level
      * decoding table -- TODO confirm against the code that initialises it. */
 106 #define LEVEL_TAB_BITS 8
 107 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 108
 /**
  * Fills the neighbour caches of the H264Context for the macroblock at
  * h->mb_xy.
  *
  * The function first resolves the positions of the top, left, top-left and
  * top-right neighbours (with corrections for FRAME_MBAFF) and stores
  * h->top_mb_xy and h->left_mb_xy[]. Depending on mb_type it then loads:
  * the intra sample availability masks and the neighbouring intra4x4
  * prediction modes, the neighbouring non-zero coefficient counts, the CABAC
  * top/left cbp, the neighbouring motion vectors, reference indices, mvd and
  * direct flags, and finally h->neighbor_transform_size.
  *
  * @param h           decoder context; h->mb_xy and h->s.mb_x / h->s.mb_y
  *                    identify the current macroblock
  * @param mb_type     type flags of the current macroblock
  * @param for_deblock non-zero when called for deblocking. In that case the
  *                    function may return at once (see the first check below),
  *                    top/left neighbours are accepted whenever their
  *                    slice_table entry is below 0xFFFF, the intra part is
  *                    skipped, and the corner neighbours and mvd/direct
  *                    caches are not loaded.
  */
 109 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 110     MpegEncContext * const s = &h->s;
 111     const int mb_xy= h->mb_xy;
 112     int topleft_xy, top_xy, topright_xy, left_xy[2];
 113     int topleft_type, top_type, topright_type, left_type[2];
 114     const int * left_block;
 115     int topleft_partition= -1;
 116     int i;
 117
         // Macroblock above: the row step is mb_stride shifted left by FIELD_PICTURE.
 118     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 119
 120     //FIXME deblocking could skip the intra and nnz parts.
         // Deblocking shortcut: nothing is refilled when slice_num is 1 or the top
         // neighbour has the same slice_table entry, provided FRAME_MBAFF is off.
 121     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 122         return;
 123
 124     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 125      * stuff, I can't imagine that these complex rules are worth it. */
 126
         // Default (non-MBAFF) neighbour positions and left block selection.
 127     topleft_xy = top_xy - 1;
 128     topright_xy= top_xy + 1;
 129     left_xy[1] = left_xy[0] = mb_xy-1;
 130     left_block = left_block_options[0];
         // MBAFF: neighbours are addressed through their macroblock pair; the
         // positions are corrected wherever the field flag of the current
         // macroblock and that of a neighbouring pair call for a different row.
 131     if(FRAME_MBAFF){
 132         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 133         const int top_pair_xy      = pair_xy     - s->mb_stride;
 134         const int topleft_pair_xy  = top_pair_xy - 1;
 135         const int topright_pair_xy = top_pair_xy + 1;
 136         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 137         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 138         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 139         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 140         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
             // 1 for the lower macroblock of the pair, 0 for the upper one.
 141         const int bottom = (s->mb_y & 1);
 142         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 143
 144         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 145             top_xy -= s->mb_stride;
 146         }
 147         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 155             topright_xy -= s->mb_stride;
 156         }
             // Field flags differ: both left entries start at the top macroblock of
             // the left pair and a different left_block_options row is used.
 157         if (left_mb_field_flag != curr_mb_field_flag) {
 158             left_xy[1] = left_xy[0] = pair_xy - 1;
 159             if (curr_mb_field_flag) {
 160                 left_xy[1] += s->mb_stride;
 161                 left_block = left_block_options[3];
 162             } else {
 163                 left_block= left_block_options[2 - bottom];
 164             }
 165         }
 166     }
 167
         // Publish the resolved neighbour positions in the context.
 168     h->top_mb_xy = top_xy;
 169     h->left_mb_xy[0] = left_xy[0];
 170     h->left_mb_xy[1] = left_xy[1];
 171     if(for_deblock){
             // Deblocking: corner neighbours are not used; top/left are accepted
             // whenever their slice_table entry is below 0xFFFF, whichever slice
             // they belong to.
 172         topleft_type = 0;
 173         topright_type = 0;
 174         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 175         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 176         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 177
 178         if(MB_MBAFF && !IS_INTRA(mb_type)){
 179             int list;
 180             for(list=0; list<h->list_count; list++){
 181                 //These values were changed for ease of performing MC, we need to change them back
 182                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 183                 //the MC code from changing ref_cache and rather use a temporary array.
 184                 if(USES_LIST(mb_type,list)){
                         // Reload the four 8x8 reference indices of this macroblock;
                         // each index is replicated over a 2x2 group of cache cells.
 185                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 186                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 188                     ref += h->b8_stride;
 189                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 190                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 191                 }
 192             }
 193         }
 194     }else{
             // Decoding: a neighbour is usable only if it belongs to the current slice.
 195         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 196         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 197         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 198         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 199         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 200
         // Intra macroblock: derive the *_samples_available bit masks and load the
         // neighbouring intra4x4 prediction modes.
 201     if(IS_INTRA(mb_type)){
             // With constrained_intra_pred only the intra bits of a neighbour type
             // pass the mask; otherwise any non-zero type does.
 202         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
             // Start from "everything available" and clear bits per missing neighbour.
 203         h->topleft_samples_available=
 204         h->top_samples_available=
 205         h->left_samples_available= 0xFFFF;
 206         h->topright_samples_available= 0xEEEA;
 207
 208         if(!(top_type & type_mask)){
 209             h->topleft_samples_available= 0xB3FF;
 210             h->top_samples_available= 0x33FF;
 211             h->topright_samples_available= 0x26EA;
 212         }
             // Left edge: when the field flags of the current and the left
             // macroblock differ, the two halves are judged separately (field MB)
             // or both macroblocks of the left pair must be usable (frame MB).
 213         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 214             if(IS_INTERLACED(mb_type)){
 215                 if(!(left_type[0] & type_mask)){
 216                     h->topleft_samples_available&= 0xDFFF;
 217                     h->left_samples_available&= 0x5FFF;
 218                 }
 219                 if(!(left_type[1] & type_mask)){
 220                     h->topleft_samples_available&= 0xFF5F;
 221                     h->left_samples_available&= 0xFF5F;
 222                 }
 223             }else{
 224                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 225                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 226                 assert(left_xy[0] == left_xy[1]);
 227                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 228                     h->topleft_samples_available&= 0xDF5F;
 229                     h->left_samples_available&= 0x5F5F;
 230                 }
 231             }
 232         }else{
 233             if(!(left_type[0] & type_mask)){
 234                 h->topleft_samples_available&= 0xDF5F;
 235                 h->left_samples_available&= 0x5F5F;
 236             }
 237         }
 238
 239         if(!(topleft_type & type_mask))
 240             h->topleft_samples_available&= 0x7FFF;
 241
 242         if(!(topright_type & type_mask))
 243             h->topright_samples_available&= 0xFBFF;
 244
 245         if(IS_INTRA4x4(mb_type)){
                 // Row above the current block in the cache: stored modes of an
                 // intra4x4 top neighbour, otherwise one substitute value for all four.
 246             if(IS_INTRA4x4(top_type)){
 247                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 248                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 249                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 250                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 251             }else{
                     // -1: neighbour not usable; 2: usable but not intra4x4
                     // (2 is presumably the DC mode -- confirm against the mode enum).
 252                 int pred;
 253                 if(!(top_type & type_mask))
 254                     pred= -1;
 255                 else{
 256                     pred= 2;
 257                 }
 258                 h->intra4x4_pred_mode_cache[4+8*0]=
 259                 h->intra4x4_pred_mode_cache[5+8*0]=
 260                 h->intra4x4_pred_mode_cache[6+8*0]=
 261                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 262             }
                 // Column left of the current block: two cache rows per left_xy entry,
                 // source rows chosen through left_block.
 263             for(i=0; i<2; i++){
 264                 if(IS_INTRA4x4(left_type[i])){
 265                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 266                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 267                 }else{
 268                     int pred;
 269                     if(!(left_type[i] & type_mask))
 270                         pred= -1;
 271                     else{
 272                         pred= 2;
 273                     }
 274                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 275                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 276                 }
 277             }
 278         }
 279     }
 280     }
 281
 282
 283 /*
 284 0 . T T. T T T T
 285 1 L . .L . . . .
 286 2 L . .L . . . .
 287 3 . T TL . . . .
 288 4 L . .L . . . .
 289 5 L . .. . . . .
 290 */
 291 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // Non-zero coefficient counts of the top neighbour (T cells in the diagram
         // above): entries 4,5,6,3 go to cache columns 4..7 of row 0, entries 9,8 and
         // 12,11 to columns 1,2 of rows 0 and 3 (presumably the two chroma planes).
 292     if(top_type){
 293         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 294         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 295         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 296         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 297
 298         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 299         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 300
 301         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 302         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 303
 304     }else{
             // No top neighbour: 0 for non-intra macroblocks under CABAC, 64 otherwise.
 305         h->non_zero_count_cache[4+8*0]=
 306         h->non_zero_count_cache[5+8*0]=
 307         h->non_zero_count_cache[6+8*0]=
 308         h->non_zero_count_cache[7+8*0]=
 309
 310         h->non_zero_count_cache[1+8*0]=
 311         h->non_zero_count_cache[2+8*0]=
 312
 313         h->non_zero_count_cache[1+8*3]=
 314         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 315
 316     }
 317
         // Same for the left neighbours (L cells in the diagram): left_block[0..3]
         // feed cache column 3, left_block[4..7] feed cache column 0.
 318     for (i=0; i<2; i++) {
 319         if(left_type[i]){
 320             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 321             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 322             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 323             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 324         }else{
 325             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 326             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 327             h->non_zero_count_cache[0+8*1 +   8*i]=
 328             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 329         }
 330     }
 331
         // CABAC only: coded block pattern of the top and left neighbours. A missing
         // neighbour yields 0x1C0 for intra macroblocks and 0 otherwise.
 332     if( h->pps.cabac ) {
 333         // top_cbp
 334         if(top_type) {
 335             h->top_cbp = h->cbp_table[top_xy];
 336         } else if(IS_INTRA(mb_type)) {
 337             h->top_cbp = 0x1C0;
 338         } else {
 339             h->top_cbp = 0;
 340         }
 341         // left_cbp
 342         if (left_type[0]) {
 343             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 344         } else if(IS_INTRA(mb_type)) {
 345             h->left_cbp = 0x1C0;
 346         } else {
 347             h->left_cbp = 0;
 348         }
             // Bits 1 and 3 of left_cbp are taken from the cbp bit selected by
             // left_block[0] (first left entry) and left_block[2] (second left entry).
 349         if (left_type[0]) {
 350             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 351         }
 352         if (left_type[1]) {
 353             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 354         }
 355     }
 356
 357 #if 1
         // Inter or direct macroblock: load neighbouring motion data for each list.
 358     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 359         int list;
 360         for(list=0; list<h->list_count; list++){
                 // A list this macroblock does not use is skipped, unless the
                 // macroblock is direct or h->deblocking_filter is set.
 361             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 362                 /*if(!h->mv_cache_clean[list]){
 363                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 364                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 365                     h->mv_cache_clean[list]= 1;
 366                 }*/
 367                 continue;
 368             }
 369             h->mv_cache_clean[list]= 0;
 370
                 // Top neighbour: its last row of 4x4 motion vectors (offset
                 // 3*b_stride) and its second row of 8x8 reference indices.
 371             if(USES_LIST(top_type, list)){
 372                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 373                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 374                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 378                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 379                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 380                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 381                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 382             }else{
                     // Zero vectors; the reference is LIST_NOT_USED when a top
                     // macroblock exists, PART_NOT_AVAILABLE when there is none.
 383                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 387                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 388             }
 389
                 // Left neighbours: rightmost 4x4 column (b_xy + 3) and rightmost 8x8
                 // column (b8_xy + 1), rows chosen through left_block.
 390             for(i=0; i<2; i++){
 391                 int cache_idx = scan8[0] - 1 + i*2*8;
 392                 if(USES_LIST(left_type[i], list)){
 393                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 394                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 395                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 396                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 397                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 398                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 399                 }else{
 400                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 401                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 402                     h->ref_cache[list][cache_idx  ]=
 403                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 404                 }
 405             }
 406
                 // The rest of the loop body is skipped for deblocking, and for direct
                 // macroblocks without spatial MV prediction when FRAME_MBAFF is off.
 407             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 408                 continue;
 409
                 // Top-left neighbour. topleft_partition is -1 (the masked offsets are
                 // kept) or 0 (they vanish, see the MBAFF case at the top).
 410             if(USES_LIST(topleft_type, list)){
 411                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 412                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 413                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 414                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 415             }else{
 416                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 417                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 418             }
 419
                 // Top-right neighbour: first block of its last 4x4 row.
 420             if(USES_LIST(topright_type, list)){
 421                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 422                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 424                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 425             }else{
 426                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 427                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 428             }
 429
                 // Skip and direct macroblocks need nothing further unless FRAME_MBAFF.
 430             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 431                 continue;
 432
                 // Preset five fixed cache cells to "not available" with zero vectors.
 433             h->ref_cache[list][scan8[5 ]+1] =
 434             h->ref_cache[list][scan8[7 ]+1] =
 435             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 436             h->ref_cache[list][scan8[4 ]] =
 437             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 438             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 439             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 441             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 442             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 443
 444             if( h->pps.cabac ) {
                     // Neighbouring motion vector differences from mvd_table, using
                     // the same positions as the motion vector loads above.
 445                 /* XXX beurk, Load mvd */
 446                 if(USES_LIST(top_type, list)){
 447                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 448                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 452                 }else{
 453                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 457                 }
 458                 if(USES_LIST(left_type[0], list)){
 459                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 460                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 462                 }else{
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 465                 }
 466                 if(USES_LIST(left_type[1], list)){
 467                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 470                 }else{
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 473                 }
 474                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 475                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 477                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 478                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 479
                     // B slices: clear the 4x4 direct_cache area of the current
                     // macroblock, then record which neighbouring blocks are direct.
 480                 if(h->slice_type_nos == FF_B_TYPE){
 481                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 482
 483                     if(IS_DIRECT(top_type)){
 484                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 485                     }else if(IS_8X8(top_type)){
 486                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 487                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 488                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 489                     }else{
 490                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 491                     }
 492
 493                     if(IS_DIRECT(left_type[0]))
 494                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 495                     else if(IS_8X8(left_type[0]))
 496                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 497                     else
 498                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 499
 500                     if(IS_DIRECT(left_type[1]))
 501                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 502                     else if(IS_8X8(left_type[1]))
 503                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 506                 }
 507             }
 508
                 // MBAFF: rescale cached neighbour data whose field flag differs from
                 // the current macroblock's. For a field macroblock, frame neighbours
                 // get their reference index doubled and the vertical mv/mvd halved;
                 // for a frame macroblock, field neighbours get the reverse.
 509             if(FRAME_MBAFF){
 510 #define MAP_MVS\
 511                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 512                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 513                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 517                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 518                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 520                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 521                 if(MB_FIELD){
 522 #define MAP_F2F(idx, mb_type)\
 523                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 524                         h->ref_cache[list][idx] <<= 1;\
 525                         h->mv_cache[list][idx][1] /= 2;\
 526                         h->mvd_cache[list][idx][1] /= 2;\
 527                     }
 528                     MAP_MVS
 529 #undef MAP_F2F
 530                 }else{
 531 #define MAP_F2F(idx, mb_type)\
 532                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 533                         h->ref_cache[list][idx] >>= 1;\
 534                         h->mv_cache[list][idx][1] <<= 1;\
 535                         h->mvd_cache[list][idx][1] <<= 1;\
 536                     }
 537                     MAP_MVS
 538 #undef MAP_F2F
 539                 }
 540             }
 541         }
 542     }
 543 #endif
 544
         // Count (0..2) of the top and first left neighbour that use the 8x8 transform.
 545     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 546 }
 547
 548 static inline void write_back_intra_pred_mode(H264Context *h){
 549     const int mb_xy= h->mb_xy;
 550
 551     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 552     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 553     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 554     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 555     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 556     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 557     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 558 }
 559
 560 /**
 561  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 562  */
 563 static inline int check_intra4x4_pred_mode(H264Context *h){
 564     MpegEncContext * const s = &h->s;
 565     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 566     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 567     int i;
 568
 569     if(!(h->top_samples_available&0x8000)){
 570         for(i=0; i<4; i++){
 571             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 572             if(status<0){
 573                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 574                 return -1;
 575             } else if(status){
 576                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 577             }
 578         }
 579     }
 580
 581     if((h->left_samples_available&0x8888)!=0x8888){
 582         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 583         for(i=0; i<4; i++){
 584             if(!(h->left_samples_available&mask[i])){
 585                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 586                 if(status<0){
 587                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 588                     return -1;
 589                 } else if(status){
 590                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 591                 }
 592             }
 593         }
 594     }
 595
 596     return 0;
 597 } //FIXME cleanup like next
 598
 599 /**
 600  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 601  */
 602 static inline int check_intra_pred_mode(H264Context *h, int mode){
 603     MpegEncContext * const s = &h->s;
 604     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 605     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 606
 607     if(mode > 6U) {
 608         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 609         return -1;
 610     }
 611
 612     if(!(h->top_samples_available&0x8000)){
 613         mode= top[ mode ];
 614         if(mode<0){
 615             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 616             return -1;
 617         }
 618     }
 619
 620     if((h->left_samples_available&0x8080) != 0x8080){
 621         mode= left[ mode ];
 622         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 623             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 624         }
 625         if(mode<0){
 626             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 627             return -1;
 628         }
 629     }
 630
 631     return mode;
 632 }
 633
 634 /**
 635  * gets the predicted intra4x4 prediction mode.
 636  */
 637 static inline int pred_intra_mode(H264Context *h, int n){
 638     const int index8= scan8[n];
 639     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 640     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 641     const int min= FFMIN(left, top);
 642
 643     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 644
 645     if(min<0) return DC_PRED;
 646     else      return min;
 647 }
 648
 649 static inline void write_back_non_zero_count(H264Context *h){
 650     const int mb_xy= h->mb_xy;
 651
 652     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 653     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 654     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 655     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 656     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 657     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 658     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 659
 660     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 661     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 662     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 663
 664     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 665     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 666     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 667 }
 668
 669 /**
 670  * gets the predicted number of non-zero coefficients.
 671  * @param n block index
 672  */
 673 static inline int pred_non_zero_count(H264Context *h, int n){
 674     const int index8= scan8[n];
 675     const int left= h->non_zero_count_cache[index8 - 1];
 676     const int top = h->non_zero_count_cache[index8 - 8];
 677     int i= left + top;
 678
 679     if(i<64) i= (i+1)>>1;
 680
 681     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 682
 683     return i&31;
 684 }
 685
 686 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 687     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 688     MpegEncContext *s = &h->s;
 689
 690     /* there is no consistent mapping of mvs to neighboring locations that will
 691      * make mbaff happy, so we can't move all this logic to fill_caches */
 692     if(FRAME_MBAFF){
 693         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 694         const int16_t *mv;
 695         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 696         *C = h->mv_cache[list][scan8[0]-2];
 697
 698         if(!MB_FIELD
 699            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 700             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 701             if(IS_INTERLACED(mb_types[topright_xy])){
 702 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 703                 const int x4 = X4, y4 = Y4;\
 704                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 705                 if(!USES_LIST(mb_type,list))\
 706                     return LIST_NOT_USED;\
 707                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 708                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 709                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 710                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 711
 712                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 713             }
 714         }
 715         if(topright_ref == PART_NOT_AVAILABLE
 716            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 717            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 718             if(!MB_FIELD
 719                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 720                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 721             }
 722             if(MB_FIELD
 723                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 724                && i >= scan8[0]+8){
 725                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 726                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 727             }
 728         }
 729 #undef SET_DIAG_MV
 730     }
 731
 732     if(topright_ref != PART_NOT_AVAILABLE){
 733         *C= h->mv_cache[list][ i - 8 + part_width ];
 734         return topright_ref;
 735     }else{
 736         tprintf(s->avctx, "topright MV not available\n");
 737
 738         *C= h->mv_cache[list][ i - 8 - 1 ];
 739         return h->ref_cache[list][ i - 8 - 1 ];
 740     }
 741 }
 742
 743 /**
 744  * gets the predicted MV.
 745  * @param n the block index
 746  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 747  * @param mx the x component of the predicted motion vector
 748  * @param my the y component of the predicted motion vector
 749  */
 750 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 751     const int index8= scan8[n];
 752     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 753     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 754     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 755     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 756     const int16_t * C;
 757     int diagonal_ref, match_count;
 758
 759     assert(part_width==1 || part_width==2 || part_width==4);
 760
 761 /* mv_cache
 762   B . . A T T T T
 763   U . . L . . , .
 764   U . . L . . . .
 765   U . . L . . , .
 766   . . . L . . . .
 767 */
 768
 769     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 770     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 771     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 772     if(match_count > 1){ //most common
 773         *mx= mid_pred(A[0], B[0], C[0]);
 774         *my= mid_pred(A[1], B[1], C[1]);
 775     }else if(match_count==1){
 776         if(left_ref==ref){
 777             *mx= A[0];
 778             *my= A[1];
 779         }else if(top_ref==ref){
 780             *mx= B[0];
 781             *my= B[1];
 782         }else{
 783             *mx= C[0];
 784             *my= C[1];
 785         }
 786     }else{
 787         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 788             *mx= A[0];
 789             *my= A[1];
 790         }else{
 791             *mx= mid_pred(A[0], B[0], C[0]);
 792             *my= mid_pred(A[1], B[1], C[1]);
 793         }
 794     }
 795
 796     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 797 }
 798
 799 /**
 800  * gets the directionally predicted 16x8 MV.
 801  * @param n the block index
 802  * @param mx the x component of the predicted motion vector
 803  * @param my the y component of the predicted motion vector
 804  */
 805 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 806     if(n==0){
 807         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 808         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 809
 810         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 811
 812         if(top_ref == ref){
 813             *mx= B[0];
 814             *my= B[1];
 815             return;
 816         }
 817     }else{
 818         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 819         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 820
 821         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 822
 823         if(left_ref == ref){
 824             *mx= A[0];
 825             *my= A[1];
 826             return;
 827         }
 828     }
 829
 830     //RARE
 831     pred_motion(h, n, 4, list, ref, mx, my);
 832 }
 833
 834 /**
 835  * gets the directionally predicted 8x16 MV.
 836  * @param n the block index
 837  * @param mx the x component of the predicted motion vector
 838  * @param my the y component of the predicted motion vector
 839  */
 840 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 841     if(n==0){
 842         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 843         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 844
 845         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 846
 847         if(left_ref == ref){
 848             *mx= A[0];
 849             *my= A[1];
 850             return;
 851         }
 852     }else{
 853         const int16_t * C;
 854         int diagonal_ref;
 855
 856         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 857
 858         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 859
 860         if(diagonal_ref == ref){
 861             *mx= C[0];
 862             *my= C[1];
 863             return;
 864         }
 865     }
 866
 867     //RARE
 868     pred_motion(h, n, 2, list, ref, mx, my);
 869 }
 870
 871 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 872     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 873     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 874
 875     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 876
 877     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 878        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 879        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 880
 881         *mx = *my = 0;
 882         return;
 883     }
 884
 885     pred_motion(h, 0, 4, 0, 0, mx, my);
 886
 887     return;
 888 }
 889
 890 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 891     int poc0 = h->ref_list[0][i].poc;
 892     int td = av_clip(poc1 - poc0, -128, 127);
 893     if(td == 0 || h->ref_list[0][i].long_ref){
 894         return 256;
 895     }else{
 896         int tb = av_clip(poc - poc0, -128, 127);
 897         int tx = (16384 + (FFABS(td) >> 1)) / td;
 898         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 899     }
 900 }
 901
 902 static inline void direct_dist_scale_factor(H264Context * const h){
 903     MpegEncContext * const s = &h->s;
 904     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 905     const int poc1 = h->ref_list[1][0].poc;
 906     int i, field;
 907     for(field=0; field<2; field++){
 908         const int poc  = h->s.current_picture_ptr->field_poc[field];
 909         const int poc1 = h->ref_list[1][0].field_poc[field];
 910         for(i=0; i < 2*h->ref_count[0]; i++)
 911             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 912     }
 913
 914     for(i=0; i<h->ref_count[0]; i++){
 915         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 916     }
 917 }
 918
 919 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 920     MpegEncContext * const s = &h->s;
 921     Picture * const ref1 = &h->ref_list[1][0];
 922     int j, old_ref, rfield;
 923     int start= mbafi ? 16                      : 0;
 924     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 925     int interl= mbafi || s->picture_structure != PICT_FRAME;
 926
 927     /* bogus; fills in for missing frames */
 928     memset(map[list], 0, sizeof(map[list]));
 929
 930     for(rfield=0; rfield<2; rfield++){
 931         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 932             int poc = ref1->ref_poc[colfield][list][old_ref];
 933
 934             if     (!interl)
 935                 poc |= 3;
 936             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 937                 poc= (poc&~3) + rfield + 1;
 938
 939             for(j=start; j<end; j++){
 940                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 941                     int cur_ref= mbafi ? (j-16)^field : j;
 942                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 943                     if(rfield == field)
 944                         map[list][old_ref] = cur_ref;
 945                     break;
 946                 }
 947             }
 948         }
 949     }
 950 }
 951
 952 static inline void direct_ref_list_init(H264Context * const h){
 953     MpegEncContext * const s = &h->s;
 954     Picture * const ref1 = &h->ref_list[1][0];
 955     Picture * const cur = s->current_picture_ptr;
 956     int list, j, field;
 957     int sidx= (s->picture_structure&1)^1;
 958     int ref1sidx= (ref1->reference&1)^1;
 959
 960     for(list=0; list<2; list++){
 961         cur->ref_count[sidx][list] = h->ref_count[list];
 962         for(j=0; j<h->ref_count[list]; j++)
 963             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 964     }
 965
 966     if(s->picture_structure == PICT_FRAME){
 967         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 968         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 969     }
 970
 971     cur->mbaff= FRAME_MBAFF;
 972
 973     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 974         return;
 975
 976     for(list=0; list<2; list++){
 977         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 978         for(field=0; field<2; field++)
 979             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 980     }
 981 }
 982
 983 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 984     MpegEncContext * const s = &h->s;
 985     int b8_stride = h->b8_stride;
 986     int b4_stride = h->b_stride;
 987     int mb_xy = h->mb_xy;
 988     int mb_type_col[2];
 989     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 990     const int8_t *l1ref0, *l1ref1;
 991     const int is_b8x8 = IS_8X8(*mb_type);
 992     unsigned int sub_mb_type;
 993     int i8, i4;
 994
 995 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 996
 997     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 998         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
 999             int cur_poc = s->current_picture_ptr->poc;
1000             int *col_poc = h->ref_list[1]->field_poc;
1001             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1002             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1003             b8_stride = 0;
1004         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1005             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1006             mb_xy += s->mb_stride*fieldoff;
1007         }
1008         goto single_col;
1009     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1010         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1011             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1012             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1013             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1014             b8_stride *= 3;
1015             b4_stride *= 6;
1016             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1017             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1018                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1019                 && !is_b8x8){
1020                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1021                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1022             }else{
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1025             }
1026         }else{                                           //     AFR/FR    -> AFR/FR
1027 single_col:
1028             mb_type_col[0] =
1029             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1030             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1031                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1032                 * so we know exactly what block size to use */
1033                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1034                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1035             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1036                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1037                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1038             }else{
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1041             }
1042         }
1043     }
1044
1045     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1046     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1047     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1048     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1049     if(!b8_stride){
1050         if(s->mb_y&1){
1051             l1ref0 += h->b8_stride;
1052             l1ref1 += h->b8_stride;
1053             l1mv0  +=  2*b4_stride;
1054             l1mv1  +=  2*b4_stride;
1055         }
1056     }
1057
1058     if(h->direct_spatial_mv_pred){
1059         int ref[2];
1060         int mv[2][2];
1061         int list;
1062
1063         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1064
1065         /* ref = min(neighbors) */
1066         for(list=0; list<2; list++){
1067             int refa = h->ref_cache[list][scan8[0] - 1];
1068             int refb = h->ref_cache[list][scan8[0] - 8];
1069             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1070             if(refc == PART_NOT_AVAILABLE)
1071                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1072             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1073             if(ref[list] < 0)
1074                 ref[list] = -1;
1075         }
1076
1077         if(ref[0] < 0 && ref[1] < 0){
1078             ref[0] = ref[1] = 0;
1079             mv[0][0] = mv[0][1] =
1080             mv[1][0] = mv[1][1] = 0;
1081         }else{
1082             for(list=0; list<2; list++){
1083                 if(ref[list] >= 0)
1084                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1085                 else
1086                     mv[list][0] = mv[list][1] = 0;
1087             }
1088         }
1089
1090         if(ref[1] < 0){
1091             if(!is_b8x8)
1092                 *mb_type &= ~MB_TYPE_L1;
1093             sub_mb_type &= ~MB_TYPE_L1;
1094         }else if(ref[0] < 0){
1095             if(!is_b8x8)
1096                 *mb_type &= ~MB_TYPE_L0;
1097             sub_mb_type &= ~MB_TYPE_L0;
1098         }
1099
1100         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1101             for(i8=0; i8<4; i8++){
1102                 int x8 = i8&1;
1103                 int y8 = i8>>1;
1104                 int xy8 = x8+y8*b8_stride;
1105                 int xy4 = 3*x8+y8*b4_stride;
1106                 int a=0, b=0;
1107
1108                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1109                     continue;
1110                 h->sub_mb_type[i8] = sub_mb_type;
1111
1112                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1113                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1114                 if(!IS_INTRA(mb_type_col[y8])
1115                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1116                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1117                     if(ref[0] > 0)
1118                         a= pack16to32(mv[0][0],mv[0][1]);
1119                     if(ref[1] > 0)
1120                         b= pack16to32(mv[1][0],mv[1][1]);
1121                 }else{
1122                     a= pack16to32(mv[0][0],mv[0][1]);
1123                     b= pack16to32(mv[1][0],mv[1][1]);
1124                 }
1125                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1126                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1127             }
1128         }else if(IS_16X16(*mb_type)){
1129             int a=0, b=0;
1130
1131             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1132             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1133             if(!IS_INTRA(mb_type_col[0])
1134                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1135                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1136                        && (h->x264_build>33 || !h->x264_build)))){
1137                 if(ref[0] > 0)
1138                     a= pack16to32(mv[0][0],mv[0][1]);
1139                 if(ref[1] > 0)
1140                     b= pack16to32(mv[1][0],mv[1][1]);
1141             }else{
1142                 a= pack16to32(mv[0][0],mv[0][1]);
1143                 b= pack16to32(mv[1][0],mv[1][1]);
1144             }
1145             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1146             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1147         }else{
1148             for(i8=0; i8<4; i8++){
1149                 const int x8 = i8&1;
1150                 const int y8 = i8>>1;
1151
1152                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1153                     continue;
1154                 h->sub_mb_type[i8] = sub_mb_type;
1155
1156                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1157                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1158                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1159                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1160
1161                 /* col_zero_flag */
1162                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1163                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1164                                                   && (h->x264_build>33 || !h->x264_build)))){
1165                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1166                     if(IS_SUB_8X8(sub_mb_type)){
1167                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1168                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1169                             if(ref[0] == 0)
1170                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1171                             if(ref[1] == 0)
1172                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1173                         }
1174                     }else
1175                     for(i4=0; i4<4; i4++){
1176                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1177                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1178                             if(ref[0] == 0)
1179                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1180                             if(ref[1] == 0)
1181                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1182                         }
1183                     }
1184                 }
1185             }
1186         }
1187     }else{ /* direct temporal mv pred */
1188         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1189         const int *dist_scale_factor = h->dist_scale_factor;
1190         int ref_offset= 0;
1191
1192         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1193             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1194             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1195             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1196         }
1197         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1198             ref_offset += 16;
1199
1200         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1201             /* FIXME assumes direct_8x8_inference == 1 */
1202             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1203
1204             for(i8=0; i8<4; i8++){
1205                 const int x8 = i8&1;
1206                 const int y8 = i8>>1;
1207                 int ref0, scale;
1208                 const int16_t (*l1mv)[2]= l1mv0;
1209
1210                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1211                     continue;
1212                 h->sub_mb_type[i8] = sub_mb_type;
1213
1214                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1215                 if(IS_INTRA(mb_type_col[y8])){
1216                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1217                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1218                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1219                     continue;
1220                 }
1221
1222                 ref0 = l1ref0[x8 + y8*b8_stride];
1223                 if(ref0 >= 0)
1224                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1225                 else{
1226                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1227                     l1mv= l1mv1;
1228                 }
1229                 scale = dist_scale_factor[ref0];
1230                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1231
1232                 {
1233                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1234                     int my_col = (mv_col[1]<<y_shift)/2;
1235                     int mx = (scale * mv_col[0] + 128) >> 8;
1236                     int my = (scale * my_col + 128) >> 8;
1237                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1238                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1239                 }
1240             }
1241             return;
1242         }
1243
1244         /* one-to-one mv scaling */
1245
1246         if(IS_16X16(*mb_type)){
1247             int ref, mv0, mv1;
1248
1249             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1250             if(IS_INTRA(mb_type_col[0])){
1251                 ref=mv0=mv1=0;
1252             }else{
1253                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1254                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1255                 const int scale = dist_scale_factor[ref0];
1256                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1257                 int mv_l0[2];
1258                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1259                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1260                 ref= ref0;
1261                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1262                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1263             }
1264             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1265             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1266             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1267         }else{
1268             for(i8=0; i8<4; i8++){
1269                 const int x8 = i8&1;
1270                 const int y8 = i8>>1;
1271                 int ref0, scale;
1272                 const int16_t (*l1mv)[2]= l1mv0;
1273
1274                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1275                     continue;
1276                 h->sub_mb_type[i8] = sub_mb_type;
1277                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1278                 if(IS_INTRA(mb_type_col[0])){
1279                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1280                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1281                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1282                     continue;
1283                 }
1284
1285                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1286                 if(ref0 >= 0)
1287                     ref0 = map_col_to_list0[0][ref0];
1288                 else{
1289                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1290                     l1mv= l1mv1;
1291                 }
1292                 scale = dist_scale_factor[ref0];
1293
1294                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1295                 if(IS_SUB_8X8(sub_mb_type)){
1296                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1297                     int mx = (scale * mv_col[0] + 128) >> 8;
1298                     int my = (scale * mv_col[1] + 128) >> 8;
1299                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1300                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1301                 }else
1302                 for(i4=0; i4<4; i4++){
1303                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1304                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1305                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1306                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1307                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1308                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1309                 }
1310             }
1311         }
1312     }
1313 }
1314
1315 static inline void write_back_motion(H264Context *h, int mb_type){
1316     MpegEncContext * const s = &h->s;
1317     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1318     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1319     int list;
1320
1321     if(!USES_LIST(mb_type, 0))
1322         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1323
1324     for(list=0; list<h->list_count; list++){
1325         int y;
1326         if(!USES_LIST(mb_type, list))
1327             continue;
1328
1329         for(y=0; y<4; y++){
1330             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1331             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1332         }
1333         if( h->pps.cabac ) {
1334             if(IS_SKIP(mb_type))
1335                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1336             else
1337             for(y=0; y<4; y++){
1338                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1339                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1340             }
1341         }
1342
1343         {
1344             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1345             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1346             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1347             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1348             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1349         }
1350     }
1351
1352     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1353         if(IS_8X8(mb_type)){
1354             uint8_t *direct_table = &h->direct_table[b8_xy];
1355             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1356             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1357             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1358         }
1359     }
1360 }
1361
1362 /**
1363  * Decodes a network abstraction layer unit.
1364  * @param consumed is the number of bytes used as input
1365  * @param length is the length of the array
1366  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1367  * @returns decoded bytes, might be src+1 if no escapes
1368  */
1369 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1370     int i, si, di;
1371     uint8_t *dst;
1372     int bufidx;
1373
1374 //    src[0]&0x80;                //forbidden bit
1375     h->nal_ref_idc= src[0]>>5;
1376     h->nal_unit_type= src[0]&0x1F;
1377
1378     src++; length--;
1379 #if 0
1380     for(i=0; i<length; i++)
1381         printf("%2X ", src[i]);
1382 #endif
1383
1384 #if HAVE_FAST_UNALIGNED
1385 # if HAVE_FAST_64BIT
1386 #   define RS 7
1387     for(i=0; i+1<length; i+=9){
1388         if(!((~*(uint64_t*)(src+i) & (*(uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1389 # else
1390 #   define RS 3
1391     for(i=0; i+1<length; i+=5){
1392         if(!((~*(uint32_t*)(src+i) & (*(uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1393 # endif
1394             continue;
1395         if(i>0 && !src[i]) i--;
1396         while(src[i]) i++;
1397 #else
1398 #   define RS 0
1399     for(i=0; i+1<length; i+=2){
1400         if(src[i]) continue;
1401         if(i>0 && src[i-1]==0) i--;
1402 #endif
1403         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1404             if(src[i+2]!=3){
1405                 /* startcode, so we must be past the end */
1406                 length=i;
1407             }
1408             break;
1409         }
1410         i-= RS;
1411     }
1412
1413     if(i>=length-1){ //no escaped 0
1414         *dst_length= length;
1415         *consumed= length+1; //+1 for the header
1416         return src;
1417     }
1418
1419     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1420     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1421     dst= h->rbsp_buffer[bufidx];
1422
1423     if (dst == NULL){
1424         return NULL;
1425     }
1426
1427 //printf("decoding esc\n");
1428     memcpy(dst, src, i);
1429     si=di=i;
1430     while(si+2<length){
1431         //remove escapes (very rare 1:2^22)
1432         if(src[si+2]>3){
1433             dst[di++]= src[si++];
1434             dst[di++]= src[si++];
1435         }else if(src[si]==0 && src[si+1]==0){
1436             if(src[si+2]==3){ //escape
1437                 dst[di++]= 0;
1438                 dst[di++]= 0;
1439                 si+=3;
1440                 continue;
1441             }else //next start code
1442                 goto nsc;
1443         }
1444
1445         dst[di++]= src[si++];
1446     }
1447     while(si<length)
1448         dst[di++]= src[si++];
1449 nsc:
1450
1451     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1452
1453     *dst_length= di;
1454     *consumed= si + 1;//+1 for the header
1455 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1456     return dst;
1457 }
1458
1459 /**
1460  * identifies the exact end of the bitstream
1461  * @return the length of the trailing, or 0 if damaged
1462  */
1463 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1464     int v= *src;
1465     int r;
1466
1467     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1468
1469     for(r=1; r<9; r++){
1470         if(v&1) return r;
1471         v>>=1;
1472     }
1473     return 0;
1474 }
1475
1476 /**
1477  * IDCT transforms the 16 dc values and dequantizes them.
1478  * @param qp quantization parameter
1479  */
1480 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1481 #define stride 16
1482     int i;
1483     int temp[16]; //FIXME check if this is a good idea
1484     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1485     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1486
1487 //memset(block, 64, 2*256);
1488 //return;
1489     for(i=0; i<4; i++){
1490         const int offset= y_offset[i];
1491         const int z0= block[offset+stride*0] + block[offset+stride*4];
1492         const int z1= block[offset+stride*0] - block[offset+stride*4];
1493         const int z2= block[offset+stride*1] - block[offset+stride*5];
1494         const int z3= block[offset+stride*1] + block[offset+stride*5];
1495
1496         temp[4*i+0]= z0+z3;
1497         temp[4*i+1]= z1+z2;
1498         temp[4*i+2]= z1-z2;
1499         temp[4*i+3]= z0-z3;
1500     }
1501
1502     for(i=0; i<4; i++){
1503         const int offset= x_offset[i];
1504         const int z0= temp[4*0+i] + temp[4*2+i];
1505         const int z1= temp[4*0+i] - temp[4*2+i];
1506         const int z2= temp[4*1+i] - temp[4*3+i];
1507         const int z3= temp[4*1+i] + temp[4*3+i];
1508
1509         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1510         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1511         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1512         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1513     }
1514 }
1515
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; uses the same access pattern as
 * h264_luma_dc_dequant_idct_c() but halves the results instead of scaling.
 * @param qp quantization parameter ??? FIXME (currently commented out)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04 = in[stride*0] + in[stride*4];
        const int dif04 = in[stride*0] - in[stride*4];
        const int dif15 = in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];
        int *out= temp + 4*i;

        out[0]= sum04 + sum15;
        out[1]= dif04 + dif15;
        out[2]= dif04 - dif15;
        out[3]= sum04 - sum15;
    }

    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02 = temp[4*0+i] + temp[4*2+i];
        const int dif02 = temp[4*0+i] - temp[4*2+i];
        const int dif13 = temp[4*1+i] - temp[4*3+i];
        const int sum13 = temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1555
1556 #undef xStride
1557 #undef stride
1558
1559 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1560     const int stride= 16*2;
1561     const int xStride= 16;
1562     int a,b,c,d,e;
1563
1564     a= block[stride*0 + xStride*0];
1565     b= block[stride*0 + xStride*1];
1566     c= block[stride*1 + xStride*0];
1567     d= block[stride*1 + xStride*1];
1568
1569     e= a-b;
1570     a= a+b;
1571     b= c-d;
1572     c= c+d;
1573
1574     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1575     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1576     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1577     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1578 }
1579
#if 0
/**
 * Disabled code: the same 2x2 butterfly as chroma_dc_dequant_idct_c()
 * on block[0], block[16], block[32] and block[48], without any scaling.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + 16*2;
    const int col= 16;

    const int sum0 = row0[0] + row0[col];
    const int dif0 = row0[0] - row0[col];
    const int sum1 = row1[0] + row1[col];
    const int dif1 = row1[0] - row1[col];

    row0[0  ]= (sum0 + sum1);
    row0[col]= (dif0 + dif1);
    row1[0  ]= (sum0 - sum1);
    row1[col]= (dif0 - dif1);
}
#endif
1602
1603 /**
1604  * gets the chroma qp.
1605  */
1606 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1607     return h->pps.chroma_qp_table[t][qscale];
1608 }
1609
1610 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1611                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1612                            int src_x_offset, int src_y_offset,
1613                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1614     MpegEncContext * const s = &h->s;
1615     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1616     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1617     const int luma_xy= (mx&3) + ((my&3)<<2);
1618     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1619     uint8_t * src_cb, * src_cr;
1620     int extra_width= h->emu_edge_width;
1621     int extra_height= h->emu_edge_height;
1622     int emu=0;
1623     const int full_mx= mx>>2;
1624     const int full_my= my>>2;
1625     const int pic_width  = 16*s->mb_width;
1626     const int pic_height = 16*s->mb_height >> MB_FIELD;
1627
1628     if(mx&7) extra_width -= 3;
1629     if(my&7) extra_height -= 3;
1630
1631     if(   full_mx < 0-extra_width
1632        || full_my < 0-extra_height
1633        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1634        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1635         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1636             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1637         emu=1;
1638     }
1639
1640     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1641     if(!square){
1642         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1643     }
1644
1645     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1646
1647     if(MB_FIELD){
1648         // chroma offset when predicting from a field of opposite parity
1649         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1650         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1651     }
1652     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1653     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1654
1655     if(emu){
1656         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1657             src_cb= s->edge_emu_buffer;
1658     }
1659     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1660
1661     if(emu){
1662         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1663             src_cr= s->edge_emu_buffer;
1664     }
1665     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1666 }
1667
1668 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1669                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1670                            int x_offset, int y_offset,
1671                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1672                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1673                            int list0, int list1){
1674     MpegEncContext * const s = &h->s;
1675     qpel_mc_func *qpix_op=  qpix_put;
1676     h264_chroma_mc_func chroma_op= chroma_put;
1677
1678     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1679     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1680     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1681     x_offset += 8*s->mb_x;
1682     y_offset += 8*(s->mb_y >> MB_FIELD);
1683
1684     if(list0){
1685         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1686         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1687                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1688                            qpix_op, chroma_op);
1689
1690         qpix_op=  qpix_avg;
1691         chroma_op= chroma_avg;
1692     }
1693
1694     if(list1){
1695         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1696         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1697                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1698                            qpix_op, chroma_op);
1699     }
1700 }
1701
1702 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1703                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1704                            int x_offset, int y_offset,
1705                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1706                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1707                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1708                            int list0, int list1){
1709     MpegEncContext * const s = &h->s;
1710
1711     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1712     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1713     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1714     x_offset += 8*s->mb_x;
1715     y_offset += 8*(s->mb_y >> MB_FIELD);
1716
1717     if(list0 && list1){
1718         /* don't optimize for luma-only case, since B-frames usually
1719          * use implicit weights => chroma too. */
1720         uint8_t *tmp_cb = s->obmc_scratchpad;
1721         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1722         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1723         int refn0 = h->ref_cache[0][ scan8[n] ];
1724         int refn1 = h->ref_cache[1][ scan8[n] ];
1725
1726         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1727                     dest_y, dest_cb, dest_cr,
1728                     x_offset, y_offset, qpix_put, chroma_put);
1729         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1730                     tmp_y, tmp_cb, tmp_cr,
1731                     x_offset, y_offset, qpix_put, chroma_put);
1732
1733         if(h->use_weight == 2){
1734             int weight0 = h->implicit_weight[refn0][refn1];
1735             int weight1 = 64 - weight0;
1736             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1737             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1739         }else{
1740             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1742                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1743             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1745                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1746             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1747                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1748                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1749         }
1750     }else{
1751         int list = list1 ? 1 : 0;
1752         int refn = h->ref_cache[list][ scan8[n] ];
1753         Picture *ref= &h->ref_list[list][refn];
1754         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1755                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1756                     qpix_put, chroma_put);
1757
1758         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1759                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1760         if(h->use_weight_chroma){
1761             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1762                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1763             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1764                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1765         }
1766     }
1767 }
1768
1769 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1770                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1771                            int x_offset, int y_offset,
1772                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1773                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1774                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1775                            int list0, int list1){
1776     if((h->use_weight==2 && list0 && list1
1777         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1778        || h->use_weight==1)
1779         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1780                          x_offset, y_offset, qpix_put, chroma_put,
1781                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1782     else
1783         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1784                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1785 }
1786
1787 static inline void prefetch_motion(H264Context *h, int list){
1788     /* fetch pixels for estimated mv 4 macroblocks ahead
1789      * optimized for 64byte cache lines */
1790     MpegEncContext * const s = &h->s;
1791     const int refn = h->ref_cache[list][scan8[0]];
1792     if(refn >= 0){
1793         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1794         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1795         uint8_t **src= h->ref_list[list][refn].data;
1796         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1797         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1798         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1799         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1800     }
1801 }
1802
1803 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1804                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1805                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1806                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1807     MpegEncContext * const s = &h->s;
1808     const int mb_xy= h->mb_xy;
1809     const int mb_type= s->current_picture.mb_type[mb_xy];
1810
1811     assert(IS_INTER(mb_type));
1812
1813     prefetch_motion(h, 0);
1814
1815     if(IS_16X16(mb_type)){
1816         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1817                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1818                 &weight_op[0], &weight_avg[0],
1819                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1820     }else if(IS_16X8(mb_type)){
1821         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1822                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1823                 &weight_op[1], &weight_avg[1],
1824                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1825         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1826                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1827                 &weight_op[1], &weight_avg[1],
1828                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1829     }else if(IS_8X16(mb_type)){
1830         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1831                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1832                 &weight_op[2], &weight_avg[2],
1833                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1834         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1835                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1836                 &weight_op[2], &weight_avg[2],
1837                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1838     }else{
1839         int i;
1840
1841         assert(IS_8X8(mb_type));
1842
1843         for(i=0; i<4; i++){
1844             const int sub_mb_type= h->sub_mb_type[i];
1845             const int n= 4*i;
1846             int x_offset= (i&1)<<2;
1847             int y_offset= (i&2)<<1;
1848
1849             if(IS_SUB_8X8(sub_mb_type)){
1850                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1851                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1852                     &weight_op[3], &weight_avg[3],
1853                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1854             }else if(IS_SUB_8X4(sub_mb_type)){
1855                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1856                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1857                     &weight_op[4], &weight_avg[4],
1858                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1859                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1860                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1861                     &weight_op[4], &weight_avg[4],
1862                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1863             }else if(IS_SUB_4X8(sub_mb_type)){
1864                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1865                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1866                     &weight_op[5], &weight_avg[5],
1867                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1868                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1869                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1870                     &weight_op[5], &weight_avg[5],
1871                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1872             }else{
1873                 int j;
1874                 assert(IS_SUB_4X4(sub_mb_type));
1875                 for(j=0; j<4; j++){
1876                     int sub_x_offset= x_offset + 2*(j&1);
1877                     int sub_y_offset= y_offset +   (j&2);
1878                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1879                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1880                         &weight_op[6], &weight_avg[6],
1881                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1882                 }
1883             }
1884         }
1885     }
1886
1887     prefetch_motion(h, 1);
1888 }
1889
1890 static av_cold void init_cavlc_level_tab(void){
1891     int suffix_length, mask;
1892     unsigned int i;
1893
1894     for(suffix_length=0; suffix_length<7; suffix_length++){
1895         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1896             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1897             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1898
1899             mask= -(level_code&1);
1900             level_code= (((2+level_code)>>1) ^ mask) - mask;
1901             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1902                 cavlc_level_tab[suffix_length][i][0]= level_code;
1903                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1904             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1905                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1906                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1907             }else{
1908                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1909                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1910             }
1911         }
1912     }
1913 }
1914
1915 static av_cold void decode_init_vlc(void){
1916     static int done = 0;
1917
1918     if (!done) {
1919         int i;
1920         int offset;
1921         done = 1;
1922
1923         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1924         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1925         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1926                  &chroma_dc_coeff_token_len [0], 1, 1,
1927                  &chroma_dc_coeff_token_bits[0], 1, 1,
1928                  INIT_VLC_USE_NEW_STATIC);
1929
1930         offset = 0;
1931         for(i=0; i<4; i++){
1932             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1933             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1934             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1935                      &coeff_token_len [i][0], 1, 1,
1936                      &coeff_token_bits[i][0], 1, 1,
1937                      INIT_VLC_USE_NEW_STATIC);
1938             offset += coeff_token_vlc_tables_size[i];
1939         }
1940         /*
1941          * This is a one time safety check to make sure that
1942          * the packed static coeff_token_vlc table sizes
1943          * were initialized correctly.
1944          */
1945         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1946
1947         for(i=0; i<3; i++){
1948             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1949             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1950             init_vlc(&chroma_dc_total_zeros_vlc[i],
1951                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1952                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1953                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1954                      INIT_VLC_USE_NEW_STATIC);
1955         }
1956         for(i=0; i<15; i++){
1957             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1958             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1959             init_vlc(&total_zeros_vlc[i],
1960                      TOTAL_ZEROS_VLC_BITS, 16,
1961                      &total_zeros_len [i][0], 1, 1,
1962                      &total_zeros_bits[i][0], 1, 1,
1963                      INIT_VLC_USE_NEW_STATIC);
1964         }
1965
1966         for(i=0; i<6; i++){
1967             run_vlc[i].table = run_vlc_tables[i];
1968             run_vlc[i].table_allocated = run_vlc_tables_size;
1969             init_vlc(&run_vlc[i],
1970                      RUN_VLC_BITS, 7,
1971                      &run_len [i][0], 1, 1,
1972                      &run_bits[i][0], 1, 1,
1973                      INIT_VLC_USE_NEW_STATIC);
1974         }
1975         run7_vlc.table = run7_vlc_table,
1976         run7_vlc.table_allocated = run7_vlc_table_size;
1977         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1978                  &run_len [6][0], 1, 1,
1979                  &run_bits[6][0], 1, 1,
1980                  INIT_VLC_USE_NEW_STATIC);
1981
1982         init_cavlc_level_tab();
1983     }
1984 }
1985
1986 static void free_tables(H264Context *h){
1987     int i;
1988     H264Context *hx;
1989     av_freep(&h->intra4x4_pred_mode);
1990     av_freep(&h->chroma_pred_mode_table);
1991     av_freep(&h->cbp_table);
1992     av_freep(&h->mvd_table[0]);
1993     av_freep(&h->mvd_table[1]);
1994     av_freep(&h->direct_table);
1995     av_freep(&h->non_zero_count);
1996     av_freep(&h->slice_table_base);
1997     h->slice_table= NULL;
1998
1999     av_freep(&h->mb2b_xy);
2000     av_freep(&h->mb2b8_xy);
2001
2002     for(i = 0; i < h->s.avctx->thread_count; i++) {
2003         hx = h->thread_context[i];
2004         if(!hx) continue;
2005         av_freep(&hx->top_borders[1]);
2006         av_freep(&hx->top_borders[0]);
2007         av_freep(&hx->s.obmc_scratchpad);
2008     }
2009 }
2010
2011 static void init_dequant8_coeff_table(H264Context *h){
2012     int i,q,x;
2013     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2014     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2015     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2016
2017     for(i=0; i<2; i++ ){
2018         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2019             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2020             break;
2021         }
2022
2023         for(q=0; q<52; q++){
2024             int shift = div6[q];
2025             int idx = rem6[q];
2026             for(x=0; x<64; x++)
2027                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2028                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2029                     h->pps.scaling_matrix8[i][x]) << shift;
2030         }
2031     }
2032 }
2033
2034 static void init_dequant4_coeff_table(H264Context *h){
2035     int i,j,q,x;
2036     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2037     for(i=0; i<6; i++ ){
2038         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2039         for(j=0; j<i; j++){
2040             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2041                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2042                 break;
2043             }
2044         }
2045         if(j<i)
2046             continue;
2047
2048         for(q=0; q<52; q++){
2049             int shift = div6[q] + 2;
2050             int idx = rem6[q];
2051             for(x=0; x<16; x++)
2052                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2053                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2054                     h->pps.scaling_matrix4[i][x]) << shift;
2055         }
2056     }
2057 }
2058
2059 static void init_dequant_tables(H264Context *h){
2060     int i,x;
2061     init_dequant4_coeff_table(h);
2062     if(h->pps.transform_8x8_mode)
2063         init_dequant8_coeff_table(h);
2064     if(h->sps.transform_bypass){
2065         for(i=0; i<6; i++)
2066             for(x=0; x<16; x++)
2067                 h->dequant4_coeff[i][0][x] = 1<<6;
2068         if(h->pps.transform_8x8_mode)
2069             for(i=0; i<2; i++)
2070                 for(x=0; x<64; x++)
2071                     h->dequant8_coeff[i][0][x] = 1<<6;
2072     }
2073 }
2074
2075
2076 /**
2077  * allocates tables.
2078  * needs width/height
2079  */
2080 static int alloc_tables(H264Context *h){
2081     MpegEncContext * const s = &h->s;
2082     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2083     int x,y;
2084
2085     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2086
2087     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2088     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2089     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2090
2091     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2092     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2093     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2094     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2095
2096     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2097     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2098
2099     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2100     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2101     for(y=0; y<s->mb_height; y++){
2102         for(x=0; x<s->mb_width; x++){
2103             const int mb_xy= x + y*s->mb_stride;
2104             const int b_xy = 4*x + 4*y*h->b_stride;
2105             const int b8_xy= 2*x + 2*y*h->b8_stride;
2106
2107             h->mb2b_xy [mb_xy]= b_xy;
2108             h->mb2b8_xy[mb_xy]= b8_xy;
2109         }
2110     }
2111
2112     s->obmc_scratchpad = NULL;
2113
2114     if(!h->dequant4_coeff[0])
2115         init_dequant_tables(h);
2116
2117     return 0;
2118 fail:
2119     free_tables(h);
2120     return -1;
2121 }
2122
2123 /**
2124  * Mimic alloc_tables(), but for every context thread.
2125  */
2126 static void clone_tables(H264Context *dst, H264Context *src){
2127     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2128     dst->non_zero_count           = src->non_zero_count;
2129     dst->slice_table              = src->slice_table;
2130     dst->cbp_table                = src->cbp_table;
2131     dst->mb2b_xy                  = src->mb2b_xy;
2132     dst->mb2b8_xy                 = src->mb2b8_xy;
2133     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2134     dst->mvd_table[0]             = src->mvd_table[0];
2135     dst->mvd_table[1]             = src->mvd_table[1];
2136     dst->direct_table             = src->direct_table;
2137
2138     dst->s.obmc_scratchpad = NULL;
2139     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2140 }
2141
2142 /**
2143  * Init context
2144  * Allocate buffers which are not shared amongst multiple threads.
2145  */
2146 static int context_init(H264Context *h){
2147     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2148     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2149
2150     return 0;
2151 fail:
2152     return -1; // free_tables will clean up for us
2153 }
2154
2155 static av_cold void common_init(H264Context *h){
2156     MpegEncContext * const s = &h->s;
2157
2158     s->width = s->avctx->width;
2159     s->height = s->avctx->height;
2160     s->codec_id= s->avctx->codec->id;
2161
2162     ff_h264_pred_init(&h->hpc, s->codec_id);
2163
2164     h->dequant_coeff_pps= -1;
2165     s->unrestricted_mv=1;
2166     s->decode=1; //FIXME
2167
2168     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2169
2170     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2171     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2172 }
2173
2174 static av_cold int decode_init(AVCodecContext *avctx){
2175     H264Context *h= avctx->priv_data;
2176     MpegEncContext * const s = &h->s;
2177
2178     MPV_decode_defaults(s);
2179
2180     s->avctx = avctx;
2181     common_init(h);
2182
2183     s->out_format = FMT_H264;
2184     s->workaround_bugs= avctx->workaround_bugs;
2185
2186     // set defaults
2187 //    s->decode_mb= ff_h263_decode_mb;
2188     s->quarter_sample = 1;
2189     s->low_delay= 1;
2190
2191     if(avctx->codec_id == CODEC_ID_SVQ3)
2192         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2193     else if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2194         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2195     else
2196         avctx->pix_fmt= PIX_FMT_YUV420P;
2197
2198     decode_init_vlc();
2199
2200     if(avctx->extradata_size > 0 && avctx->extradata &&
2201        *(char *)avctx->extradata == 1){
2202         h->is_avc = 1;
2203         h->got_avcC = 0;
2204     } else {
2205         h->is_avc = 0;
2206     }
2207
2208     h->thread_context[0] = h;
2209     h->outputed_poc = INT_MIN;
2210     h->prev_poc_msb= 1<<16;
2211     return 0;
2212 }
2213
2214 static int frame_start(H264Context *h){
2215     MpegEncContext * const s = &h->s;
2216     int i;
2217
2218     if(MPV_frame_start(s, s->avctx) < 0)
2219         return -1;
2220     ff_er_frame_start(s);
2221     /*
2222      * MPV_frame_start uses pict_type to derive key_frame.
2223      * This is incorrect for H.264; IDR markings must be used.
2224      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2225      * See decode_nal_units().
2226      */
2227     s->current_picture_ptr->key_frame= 0;
2228
2229     assert(s->linesize && s->uvlinesize);
2230
2231     for(i=0; i<16; i++){
2232         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2233         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2234     }
2235     for(i=0; i<4; i++){
2236         h->block_offset[16+i]=
2237         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+16+i]=
2239         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2240     }
2241
2242     /* can't be in alloc_tables because linesize isn't known there.
2243      * FIXME: redo bipred weight to not require extra buffer? */
2244     for(i = 0; i < s->avctx->thread_count; i++)
2245         if(!h->thread_context[i]->s.obmc_scratchpad)
2246             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2247
2248     /* some macroblocks will be accessed before they're available */
2249     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2250         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2251
2252 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2253
2254     // We mark the current picture as non-reference after allocating it, so
2255     // that if we break out due to an error it can be released automatically
2256     // in the next MPV_frame_start().
2257     // SVQ3 as well as most other codecs have only last/next/current and thus
2258     // get released even with set reference, besides SVQ3 and others do not
2259     // mark frames as reference later "naturally".
2260     if(s->codec_id != CODEC_ID_SVQ3)
2261         s->current_picture_ptr->reference= 0;
2262
2263     s->current_picture_ptr->field_poc[0]=
2264     s->current_picture_ptr->field_poc[1]= INT_MAX;
2265     assert(s->current_picture_ptr->long_ref==0);
2266
2267     return 0;
2268 }
2269
/**
 * Saves the last row and the right-most column of the current macroblock
 * into h->top_borders[] and h->left_border[].
 *
 * hl_decode_mb_internal() calls this right before the loop filter is run on
 * the macroblock; xchg_mb_border() later swaps the saved samples back in
 * around intra macroblocks.
 *
 * Layout as written here: one top_borders entry per macroblock column holds
 * 16 luma bytes followed by 8 cb and 8 cr bytes; left_border holds luma
 * starting at "offset", cb at "uvoffset+34" and cr at "uvoffset+34+18".
 * In MBAFF frames (only handled when simple is 0) the offsets, the stride
 * inside left_border and the top_borders line depend on the macroblock row
 * parity and on MB_MBAFF.
 *
 * @param src_y      first luma sample of the macroblock, as passed by the caller
 * @param src_cb     first cb sample of the macroblock
 * @param src_cr     first cr sample of the macroblock
 * @param linesize   distance in bytes between two luma rows of the macroblock
 * @param uvlinesize distance in bytes between two chroma rows of the macroblock
 * @param simple     nonzero skips the MBAFF handling and the CODEC_FLAG_GRAY test
 */
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273     int step    = 1;
2274     int offset  = 1;
2275     int uvoffset= 1;
2276     int top_idx = 1;
2277     int skiplast= 0;
2278
2279     src_y  -=   linesize; // from here on row i of the macroblock is at src + (i+1)*linesize
2280     src_cb -= uvlinesize;
2281     src_cr -= uvlinesize;
2282
2283     if(!simple && FRAME_MBAFF){
2284         if(s->mb_y&1){
2285             offset  = MB_MBAFF ? 1 : 17;
2286             uvoffset= MB_MBAFF ? 1 : 9;
2287             if(!MB_MBAFF){
2288                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize); // second to last luma row goes to line 0
2289                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2290                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2291                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2292                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2293                 }
2294             }
2295         }else{
2296             if(!MB_MBAFF){
2297                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2298                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2299                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2300                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2301                 }
2302                 skiplast= 1; // the loops below then stop one row early
2303             }
2304             offset  =
2305             uvoffset=
2306             top_idx = MB_MBAFF ? 0 : 1;
2307         }
2308         step= MB_MBAFF ? 2 : 1;
2309     }
2310
2311     // There are two lines saved, the line above the top macroblock of a pair,
2312     // and the line above the bottom macroblock
2313     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15]; // taken from the saved top line before it is overwritten below
2314     for(i=1; i<17 - skiplast; i++){
2315         h->left_border[offset+i*step]= src_y[15+i*  linesize]; // right-most luma column
2316     }
2317
2318     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize); // last luma row, 2 x 8 bytes
2319     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2320
2321     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2322         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2323         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2324         for(i=1; i<9 - skiplast; i++){
2325             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize]; // right-most chroma columns
2326             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2327         }
2328         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize); // last chroma rows
2329         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2330     }
2331 }
2332
/**
 * Moves the samples saved by backup_mb_border() into the picture positions
 * to the left of and above the current macroblock and, depending on xchg,
 * stores the picture samples found there back into the save buffers.
 *
 * hl_decode_mb_internal() calls this around intra prediction when the loop
 * filter is enabled: once with xchg=1 before predicting and once with
 * xchg=0 afterwards.
 *
 * The left column is only touched if deblock_left is set and the top row
 * only if deblock_top is set. With h->deblocking_filter == 2 these flags
 * compare the slice_table entries of the current and the neighbouring
 * macroblock; otherwise they only depend on the macroblock position.
 *
 * @param src_y      first luma sample of the macroblock
 * @param src_cb     first cb sample of the macroblock
 * @param src_cr     first cr sample of the macroblock
 * @param linesize   distance in bytes between two luma rows of the macroblock
 * @param uvlinesize distance in bytes between two chroma rows of the macroblock
 * @param xchg       nonzero: exchange saved and picture samples where the
 *                   XCHG() invocation passes it through; zero: only write
 *                   the saved samples into the picture
 * @param simple     nonzero skips the MBAFF handling and the CODEC_FLAG_GRAY test
 */
2333 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2334     MpegEncContext * const s = &h->s;
2335     int temp8, i;
2336     uint64_t temp64;
2337     int deblock_left;
2338     int deblock_top;
2339     int mb_xy;
2340     int step    = 1;
2341     int offset  = 1;
2342     int uvoffset= 1;
2343     int top_idx = 1;
2344
2345     if(!simple && FRAME_MBAFF){ // same offset selection as in backup_mb_border()
2346         if(s->mb_y&1){
2347             offset  = MB_MBAFF ? 1 : 17;
2348             uvoffset= MB_MBAFF ? 1 : 9;
2349         }else{
2350             offset  =
2351             uvoffset=
2352             top_idx = MB_MBAFF ? 0 : 1;
2353         }
2354         step= MB_MBAFF ? 2 : 1;
2355     }
2356
2357     if(h->deblocking_filter == 2) {
2358         mb_xy = h->mb_xy;
2359         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2360         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2361     } else {
2362         deblock_left = (s->mb_x > 0);
2363         deblock_top =  (s->mb_y > !!MB_FIELD);
2364     }
2365
2366     src_y  -=   linesize + 1; // now points at the sample above and to the left of the macroblock
2367     src_cb -= uvlinesize + 1;
2368     src_cr -= uvlinesize + 1;
2369
/* XCHG(a,b,t,xchg): b always receives the old value of a; a receives the
 * old value of b only if xchg is nonzero. t is a scratch variable. */
2370 #define XCHG(a,b,t,xchg)\
2371 t= a;\
2372 if(xchg)\
2373     a= b;\
2374 b= t;
2375
2376     if(deblock_left){
2377         for(i = !deblock_top; i<16; i++){ // row 0 is the corner sample, skipped without a top neighbour
2378             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2379         }
2380         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2381     }
2382
2383     if(deblock_top){
2384         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2385         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2386         if(s->mb_x+1 < s->mb_width){ // the 8 luma samples above the macroblock to the right
2387             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2388         }
2389     }
2390
2391     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2392         if(deblock_left){
2393             for(i = !deblock_top; i<8; i++){
2394                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2395                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2396             }
2397             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2398             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2399         }
2400         if(deblock_top){
2401             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2402             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2403         }
2404     }
2405 }
2406
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into the current
 * picture and, if enabled, runs the loop filter on it.
 *
 * Steps, in order:
 *  - compute the destination pointers and line sizes (doubled line sizes
 *    and the second block_offset set for field macroblocks),
 *  - intra PCM: copy the samples held in h->mb into the picture,
 *  - intra: run chroma and luma prediction, adding the residual per block
 *    for intra 4x4 / 8x8 macroblocks; the border samples are swapped in and
 *    out with xchg_mb_border() when the loop filter is enabled,
 *  - inter (H.264 only): motion compensation through hl_motion(),
 *  - add the luma residual for macroblocks that are not intra 4x4 and the
 *    chroma residual when h->cbp has one of the 0x30 bits set,
 *  - clear h->mb if there was a coded block pattern or the macroblock is intra,
 *  - with the loop filter enabled: back up the borders, refill the caches,
 *    recompute the chroma qp values and filter the macroblock.
 *
 * @param simple nonzero disables the handling of field macroblocks, intra
 *               PCM, transform bypass, CODEC_FLAG_GRAY and non-H.264
 *               (SVQ3) paths; both callers pass a constant
 */
2407 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2408     MpegEncContext * const s = &h->s;
2409     const int mb_x= s->mb_x;
2410     const int mb_y= s->mb_y;
2411     const int mb_xy= h->mb_xy;
2412     const int mb_type= s->current_picture.mb_type[mb_xy];
2413     uint8_t  *dest_y, *dest_cb, *dest_cr;
2414     int linesize, uvlinesize /*dct_offset*/;
2415     int i;
2416     int *block_offset = &h->block_offset[0];
2417     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2418     /* is_h264 should always be true if SVQ3 is disabled. */
2419     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2420     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2421     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2422
2423     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2424     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2425     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2426
2427     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2428     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2429
2430     if (!simple && MB_FIELD) { // field macroblock: consecutive rows are two picture lines apart
2431         linesize   = h->mb_linesize   = s->linesize * 2;
2432         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2433         block_offset = &h->block_offset[24];
2434         if(mb_y&1){ //FIXME move out of this function?
2435             dest_y -= s->linesize*15;
2436             dest_cb-= s->uvlinesize*7;
2437             dest_cr-= s->uvlinesize*7;
2438         }
2439         if(FRAME_MBAFF) {
2440             int list;
2441             for(list=0; list<h->list_count; list++){ // rewrite the cached reference indices of every used list
2442                 if(!USES_LIST(mb_type, list))
2443                     continue;
2444                 if(IS_16X16(mb_type)){
2445                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2446                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2447                 }else{
2448                     for(i=0; i<16; i+=4){
2449                         int ref = h->ref_cache[list][scan8[i]];
2450                         if(ref >= 0)
2451                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2452                     }
2453                 }
2454             }
2455         }
2456     } else {
2457         linesize   = h->mb_linesize   = s->linesize;
2458         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2459 //        dct_offset = s->linesize * 16;
2460     }
2461
2462     if (!simple && IS_INTRA_PCM(mb_type)) { // PCM: the samples are copied from h->mb without prediction or transform
2463         for (i=0; i<16; i++) {
2464             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2465         }
2466         for (i=0; i<8; i++) {
2467             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2468             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2469         }
2470     } else {
2471         if(IS_INTRA(mb_type)){
2472             if(h->deblocking_filter)
2473                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple); // swap the saved border samples in before predicting
2474
2475             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2476                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2477                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2478             }
2479
2480             if(IS_INTRA4x4(mb_type)){
2481                 if(simple || !s->encoding){
2482                     if(IS_8x8DCT(mb_type)){
2483                         if(transform_bypass){
2484                             idct_dc_add =
2485                             idct_add    = s->dsp.add_pixels8;
2486                         }else{
2487                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2488                             idct_add    = s->dsp.h264_idct8_add;
2489                         }
2490                         for(i=0; i<16; i+=4){ // one iteration per 8x8 block
2491                             uint8_t * const ptr= dest_y + block_offset[i];
2492                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2493                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2494                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2495                             }else{
2496                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2497                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2498                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2499                                 if(nnz){
2500                                     if(nnz == 1 && h->mb[i*16])
2501                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2502                                     else
2503                                         idct_add   (ptr, h->mb + i*16, linesize);
2504                                 }
2505                             }
2506                         }
2507                     }else{
2508                         if(transform_bypass){
2509                             idct_dc_add =
2510                             idct_add    = s->dsp.add_pixels4;
2511                         }else{
2512                             idct_dc_add = s->dsp.h264_idct_dc_add;
2513                             idct_add    = s->dsp.h264_idct_add;
2514                         }
2515                         for(i=0; i<16; i++){ // one iteration per 4x4 block
2516                             uint8_t * const ptr= dest_y + block_offset[i];
2517                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2518
2519                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2520                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2521                             }else{
2522                                 uint8_t *topright;
2523                                 int nnz, tr;
2524                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2525                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2526                                     assert(mb_y || linesize <= block_offset[i]);
2527                                     if(!topright_avail){
2528                                         tr= ptr[3 - linesize]*0x01010101; // replicate the last sample of the row above into 4 bytes
2529                                         topright= (uint8_t*) &tr;
2530                                     }else
2531                                         topright= ptr + 4 - linesize;
2532                                 }else
2533                                     topright= NULL;
2534
2535                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2536                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2537                                 if(nnz){
2538                                     if(is_h264){
2539                                         if(nnz == 1 && h->mb[i*16])
2540                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2541                                         else
2542                                             idct_add   (ptr, h->mb + i*16, linesize);
2543                                     }else
2544                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2545                                 }
2546                             }
2547                         }
2548                     }
2549                 }
2550             }else{
2551                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2552                 if(is_h264){
2553                     if(!transform_bypass)
2554                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2555                 }else
2556                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2557             }
2558             if(h->deblocking_filter)
2559                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple); // write the saved border samples back into the picture
2560         }else if(is_h264){
2561             hl_motion(h, dest_y, dest_cb, dest_cr,
2562                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2563                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2564                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2565         }
2566
2567
2568         if(!IS_INTRA4x4(mb_type)){ // luma residual; intra 4x4 macroblocks already added theirs above
2569             if(is_h264){
2570                 if(IS_INTRA16x16(mb_type)){
2571                     if(transform_bypass){
2572                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2573                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2574                         }else{
2575                             for(i=0; i<16; i++){
2576                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2577                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2578                             }
2579                         }
2580                     }else{
2581                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2582                     }
2583                 }else if(h->cbp&15){
2584                     if(transform_bypass){
2585                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2586                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2587                         for(i=0; i<16; i+=di){
2588                             if(h->non_zero_count_cache[ scan8[i] ]){
2589                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2590                             }
2591                         }
2592                     }else{
2593                         if(IS_8x8DCT(mb_type)){
2594                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2595                         }else{
2596                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2597                         }
2598                     }
2599                 }
2600             }else{
2601                 for(i=0; i<16; i++){
2602                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2603                         uint8_t * const ptr= dest_y + block_offset[i];
2604                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2605                     }
2606                 }
2607             }
2608         }
2609
2610         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){ // chroma residual
2611             uint8_t *dest[2] = {dest_cb, dest_cr};
2612             if(transform_bypass){
2613                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2614                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2615                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2616                 }else{
2617                     idct_add = s->dsp.add_pixels4;
2618                     for(i=16; i<16+8; i++){ // blocks 16..19 go to dest[0] (cb), 20..23 to dest[1] (cr)
2619                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2620                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2621                     }
2622                 }
2623             }else{
2624                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2625                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2626                 if(is_h264){
2627                     idct_add = s->dsp.h264_idct_add;
2628                     idct_dc_add = s->dsp.h264_idct_dc_add;
2629                     for(i=16; i<16+8; i++){
2630                         if(h->non_zero_count_cache[ scan8[i] ])
2631                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2632                         else if(h->mb[i*16])
2633                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2634                     }
2635                 }else{
2636                     for(i=16; i<16+8; i++){
2637                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2638                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2639                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2640                         }
2641                     }
2642                 }
2643             }
2644         }
2645     }
2646     if(h->cbp || IS_INTRA(mb_type))
2647         s->dsp.clear_blocks(h->mb);
2648
2649     if(h->deblocking_filter) {
2650         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple); // save the border samples before filtering this macroblock
2651         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2652         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2653         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2654         if (!simple && FRAME_MBAFF) {
2655             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2656         } else {
2657             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2658         }
2659     }
2660 }
2661
2662 /**
2663  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2664  */
2665 static void hl_decode_mb_simple(H264Context *h){
2666     hl_decode_mb_internal(h, 1);
2667 }
2668
2669 /**
2670  * Process a macroblock; this handles edge cases, such as interlacing.
2671  */
2672 static void av_noinline hl_decode_mb_complex(H264Context *h){
2673     hl_decode_mb_internal(h, 0);
2674 }
2675
2676 static void hl_decode_mb(H264Context *h){
2677     MpegEncContext * const s = &h->s;
2678     const int mb_xy= h->mb_xy;
2679     const int mb_type= s->current_picture.mb_type[mb_xy];
2680     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2681
2682     if(CONFIG_H264_ENCODER && !s->decode)
2683         return;
2684
2685     if (is_complex)
2686         hl_decode_mb_complex(h);
2687     else hl_decode_mb_simple(h);
2688 }
2689
2690 static void pic_as_field(Picture *pic, const int parity){
2691     int i;
2692     for (i = 0; i < 4; ++i) {
2693         if (parity == PICT_BOTTOM_FIELD)
2694             pic->data[i] += pic->linesize[i];
2695         pic->reference = parity;
2696         pic->linesize[i] *= 2;
2697     }
2698     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2699 }
2700
2701 static int split_field_copy(Picture *dest, Picture *src,
2702                             int parity, int id_add){
2703     int match = !!(src->reference & parity);
2704
2705     if (match) {
2706         *dest = *src;
2707         if(parity != PICT_FRAME){
2708             pic_as_field(dest, parity);
2709             dest->pic_id *= 2;
2710             dest->pic_id += id_add;
2711         }
2712     }
2713
2714     return match;
2715 }
2716
2717 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2718     int i[2]={0};
2719     int index=0;
2720
2721     while(i[0]<len || i[1]<len){
2722         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2723             i[0]++;
2724         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2725             i[1]++;
2726         if(i[0] < len){
2727             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2728             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2729         }
2730         if(i[1] < len){
2731             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2732             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2733         }
2734     }
2735
2736     return index;
2737 }
2738
2739 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2740     int i, best_poc;
2741     int out_i= 0;
2742
2743     for(;;){
2744         best_poc= dir ? INT_MIN : INT_MAX;
2745
2746         for(i=0; i<len; i++){
2747             const int poc= src[i]->poc;
2748             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2749                 best_poc= poc;
2750                 sorted[out_i]= src[i];
2751             }
2752         }
2753         if(best_poc == (dir ? INT_MIN : INT_MAX))
2754             break;
2755         limit= sorted[out_i++]->poc - dir;
2756     }
2757     return out_i;
2758 }
2759
2760 /**
2761  * fills the default_ref_list.
2762  */
2763 static int fill_default_ref_list(H264Context *h){
2764     MpegEncContext * const s = &h->s;
2765     int i, len;
2766
2767     if(h->slice_type_nos==FF_B_TYPE){
2768         Picture *sorted[32];
2769         int cur_poc, list;
2770         int lens[2];
2771
2772         if(FIELD_PICTURE)
2773             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2774         else
2775             cur_poc= s->current_picture_ptr->poc;
2776
2777         for(list= 0; list<2; list++){
2778             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2779             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2780             assert(len<=32);
2781             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2782             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2783             assert(len<=32);
2784
2785             if(len < h->ref_count[list])
2786                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2787             lens[list]= len;
2788         }
2789
2790         if(lens[0] == lens[1] && lens[1] > 1){
2791             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2792             if(i == lens[0])
2793                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2794         }
2795     }else{
2796         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2797         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2798         assert(len <= 32);
2799         if(len < h->ref_count[0])
2800             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2801     }
2802 #ifdef TRACE
2803     for (i=0; i<h->ref_count[0]; i++) {
2804         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2805     }
2806     if(h->slice_type_nos==FF_B_TYPE){
2807         for (i=0; i<h->ref_count[1]; i++) {
2808             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2809         }
2810     }
2811 #endif
2812     return 0;
2813 }
2814
/* Debug helpers defined further down in this file; declared here because
 * decode_ref_pic_list_reordering() calls them before their definition. */
static void print_short_term(H264Context *h);
static void print_long_term(H264Context *h);
2817
2818 /**
2819  * Extract structure information about the picture described by pic_num in
2820  * the current decoding context (frame or field). Note that pic_num is
2821  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2822  * @param pic_num picture number for which to extract structure information
2823  * @param structure one of PICT_XXX describing structure of picture
2824  *                      with pic_num
2825  * @return frame number (short term) or long term index of picture
2826  *         described by pic_num
2827  */
2828 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2829     MpegEncContext * const s = &h->s;
2830
2831     *structure = s->picture_structure;
2832     if(FIELD_PICTURE){
2833         if (!(pic_num & 1))
2834             /* opposite field */
2835             *structure ^= PICT_FRAME;
2836         pic_num >>= 1;
2837     }
2838
2839     return pic_num;
2840 }
2841
2842 static int decode_ref_pic_list_reordering(H264Context *h){
2843     MpegEncContext * const s = &h->s;
2844     int list, index, pic_structure;
2845
2846     print_short_term(h);
2847     print_long_term(h);
2848
2849     for(list=0; list<h->list_count; list++){
2850         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2851
2852         if(get_bits1(&s->gb)){
2853             int pred= h->curr_pic_num;
2854
2855             for(index=0; ; index++){
2856                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2857                 unsigned int pic_id;
2858                 int i;
2859                 Picture *ref = NULL;
2860
2861                 if(reordering_of_pic_nums_idc==3)
2862                     break;
2863
2864                 if(index >= h->ref_count[list]){
2865                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2866                     return -1;
2867                 }
2868
2869                 if(reordering_of_pic_nums_idc<3){
2870                     if(reordering_of_pic_nums_idc<2){
2871                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2872                         int frame_num;
2873
2874                         if(abs_diff_pic_num > h->max_pic_num){
2875                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2876                             return -1;
2877                         }
2878
2879                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2880                         else                                pred+= abs_diff_pic_num;
2881                         pred &= h->max_pic_num - 1;
2882
2883                         frame_num = pic_num_extract(h, pred, &pic_structure);
2884
2885                         for(i= h->short_ref_count-1; i>=0; i--){
2886                             ref = h->short_ref[i];
2887                             assert(ref->reference);
2888                             assert(!ref->long_ref);
2889                             if(
2890                                    ref->frame_num == frame_num &&
2891                                    (ref->reference & pic_structure)
2892                               )
2893                                 break;
2894                         }
2895                         if(i>=0)
2896                             ref->pic_id= pred;
2897                     }else{
2898                         int long_idx;
2899                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2900
2901                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2902
2903                         if(long_idx>31){
2904                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2905                             return -1;
2906                         }
2907                         ref = h->long_ref[long_idx];
2908                         assert(!(ref && !ref->reference));
2909                         if(ref && (ref->reference & pic_structure)){
2910                             ref->pic_id= pic_id;
2911                             assert(ref->long_ref);
2912                             i=0;
2913                         }else{
2914                             i=-1;
2915                         }
2916                     }
2917
2918                     if (i < 0) {
2919                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2920                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2921                     } else {
2922                         for(i=index; i+1<h->ref_count[list]; i++){
2923                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2924                                 break;
2925                         }
2926                         for(; i > index; i--){
2927                             h->ref_list[list][i]= h->ref_list[list][i-1];
2928                         }
2929                         h->ref_list[list][index]= *ref;
2930                         if (FIELD_PICTURE){
2931                             pic_as_field(&h->ref_list[list][index], pic_structure);
2932                         }
2933                     }
2934                 }else{
2935                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2936                     return -1;
2937                 }
2938             }
2939         }
2940     }
2941     for(list=0; list<h->list_count; list++){
2942         for(index= 0; index < h->ref_count[list]; index++){
2943             if(!h->ref_list[list][index].data[0]){
2944                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2945                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2946             }
2947         }
2948     }
2949
2950     return 0;
2951 }
2952
2953 static void fill_mbaff_ref_list(H264Context *h){
2954     int list, i, j;
2955     for(list=0; list<2; list++){ //FIXME try list_count
2956         for(i=0; i<h->ref_count[list]; i++){
2957             Picture *frame = &h->ref_list[list][i];
2958             Picture *field = &h->ref_list[list][16+2*i];
2959             field[0] = *frame;
2960             for(j=0; j<3; j++)
2961                 field[0].linesize[j] <<= 1;
2962             field[0].reference = PICT_TOP_FIELD;
2963             field[0].poc= field[0].field_poc[0];
2964             field[1] = field[0];
2965             for(j=0; j<3; j++)
2966                 field[1].data[j] += frame->linesize[j];
2967             field[1].reference = PICT_BOTTOM_FIELD;
2968             field[1].poc= field[1].field_poc[1];
2969
2970             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2971             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2972             for(j=0; j<2; j++){
2973                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2974                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2975             }
2976         }
2977     }
2978     for(j=0; j<h->ref_count[1]; j++){
2979         for(i=0; i<h->ref_count[0]; i++)
2980             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2981         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2982         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2983     }
2984 }
2985
2986 static int pred_weight_table(H264Context *h){
2987     MpegEncContext * const s = &h->s;
2988     int list, i;
2989     int luma_def, chroma_def;
2990
2991     h->use_weight= 0;
2992     h->use_weight_chroma= 0;
2993     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2994     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2995     luma_def = 1<<h->luma_log2_weight_denom;
2996     chroma_def = 1<<h->chroma_log2_weight_denom;
2997
2998     for(list=0; list<2; list++){
2999         for(i=0; i<h->ref_count[list]; i++){
3000             int luma_weight_flag, chroma_weight_flag;
3001
3002             luma_weight_flag= get_bits1(&s->gb);
3003             if(luma_weight_flag){
3004                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3005                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3006                 if(   h->luma_weight[list][i] != luma_def
3007                    || h->luma_offset[list][i] != 0)
3008                     h->use_weight= 1;
3009             }else{
3010                 h->luma_weight[list][i]= luma_def;
3011                 h->luma_offset[list][i]= 0;
3012             }
3013
3014             if(CHROMA){
3015                 chroma_weight_flag= get_bits1(&s->gb);
3016                 if(chroma_weight_flag){
3017                     int j;
3018                     for(j=0; j<2; j++){
3019                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3020                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3021                         if(   h->chroma_weight[list][i][j] != chroma_def
3022                         || h->chroma_offset[list][i][j] != 0)
3023                             h->use_weight_chroma= 1;
3024                     }
3025                 }else{
3026                     int j;
3027                     for(j=0; j<2; j++){
3028                         h->chroma_weight[list][i][j]= chroma_def;
3029                         h->chroma_offset[list][i][j]= 0;
3030                     }
3031                 }
3032             }
3033         }
3034         if(h->slice_type_nos != FF_B_TYPE) break;
3035     }
3036     h->use_weight= h->use_weight || h->use_weight_chroma;
3037     return 0;
3038 }
3039
3040 static void implicit_weight_table(H264Context *h){
3041     MpegEncContext * const s = &h->s;
3042     int ref0, ref1;
3043     int cur_poc = s->current_picture_ptr->poc;
3044
3045     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3046        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3047         h->use_weight= 0;
3048         h->use_weight_chroma= 0;
3049         return;
3050     }
3051
3052     h->use_weight= 2;
3053     h->use_weight_chroma= 2;
3054     h->luma_log2_weight_denom= 5;
3055     h->chroma_log2_weight_denom= 5;
3056
3057     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3058         int poc0 = h->ref_list[0][ref0].poc;
3059         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3060             int poc1 = h->ref_list[1][ref1].poc;
3061             int td = av_clip(poc1 - poc0, -128, 127);
3062             if(td){
3063                 int tb = av_clip(cur_poc - poc0, -128, 127);
3064                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3065                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3066                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3067                     h->implicit_weight[ref0][ref1] = 32;
3068                 else
3069                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3070             }else
3071                 h->implicit_weight[ref0][ref1] = 32;
3072         }
3073     }
3074 }
3075
3076 /**
3077  * Mark a picture as no longer needed for reference. The refmask
3078  * argument allows unreferencing of individual fields or the whole frame.
3079  * If the picture becomes entirely unreferenced, but is being held for
3080  * display purposes, it is marked as such.
3081  * @param refmask mask of fields to unreference; the mask is bitwise
3082  *                anded with the reference marking of pic
3083  * @return non-zero if pic becomes entirely unreferenced (except possibly
3084  *         for display purposes) zero if one of the fields remains in
3085  *         reference
3086  */
3087 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3088     int i;
3089     if (pic->reference &= refmask) {
3090         return 0;
3091     } else {
3092         for(i = 0; h->delayed_pic[i]; i++)
3093             if(pic == h->delayed_pic[i]){
3094                 pic->reference=DELAYED_PIC_REF;
3095                 break;
3096             }
3097         return 1;
3098     }
3099 }
3100
3101 /**
3102  * instantaneous decoder refresh.
3103  */
3104 static void idr(H264Context *h){
3105     int i;
3106
3107     for(i=0; i<16; i++){
3108         remove_long(h, i, 0);
3109     }
3110     assert(h->long_ref_count==0);
3111
3112     for(i=0; i<h->short_ref_count; i++){
3113         unreference_pic(h, h->short_ref[i], 0);
3114         h->short_ref[i]= NULL;
3115     }
3116     h->short_ref_count=0;
3117     h->prev_frame_num= 0;
3118     h->prev_frame_num_offset= 0;
3119     h->prev_poc_msb=
3120     h->prev_poc_lsb= 0;
3121 }
3122
3123 /* forget old pics after a seek */
3124 static void flush_dpb(AVCodecContext *avctx){
3125     H264Context *h= avctx->priv_data;
3126     int i;
3127     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3128         if(h->delayed_pic[i])
3129             h->delayed_pic[i]->reference= 0;
3130         h->delayed_pic[i]= NULL;
3131     }
3132     h->outputed_poc= INT_MIN;
3133     idr(h);
3134     if(h->s.current_picture_ptr)
3135         h->s.current_picture_ptr->reference= 0;
3136     h->s.first_field= 0;
3137     ff_mpeg_flush(avctx);
3138 }
3139
3140 /**
3141  * Find a Picture in the short term reference list by frame number.
3142  * @param frame_num frame number to search for
3143  * @param idx the index into h->short_ref where returned picture is found
3144  *            undefined if no picture found.
3145  * @return pointer to the found picture, or NULL if no pic with the provided
3146  *                 frame number is found
3147  */
3148 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3149     MpegEncContext * const s = &h->s;
3150     int i;
3151
3152     for(i=0; i<h->short_ref_count; i++){
3153         Picture *pic= h->short_ref[i];
3154         if(s->avctx->debug&FF_DEBUG_MMCO)
3155             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3156         if(pic->frame_num == frame_num) {
3157             *idx = i;
3158             return pic;
3159         }
3160     }
3161     return NULL;
3162 }
3163
3164 /**
3165  * Remove a picture from the short term reference list by its index in
3166  * that list.  This does no checking on the provided index; it is assumed
3167  * to be valid. Other list entries are shifted down.
3168  * @param i index into h->short_ref of picture to remove.
3169  */
3170 static void remove_short_at_index(H264Context *h, int i){
3171     assert(i >= 0 && i < h->short_ref_count);
3172     h->short_ref[i]= NULL;
3173     if (--h->short_ref_count)
3174         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3175 }
3176
3177 /**
3178  *
3179  * @return the removed picture or NULL if an error occurs
3180  */
3181 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3182     MpegEncContext * const s = &h->s;
3183     Picture *pic;
3184     int i;
3185
3186     if(s->avctx->debug&FF_DEBUG_MMCO)
3187         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3188
3189     pic = find_short(h, frame_num, &i);
3190     if (pic){
3191         if(unreference_pic(h, pic, ref_mask))
3192         remove_short_at_index(h, i);
3193     }
3194
3195     return pic;
3196 }
3197
3198 /**
3199  * Remove a picture from the long term reference list by its index in
3200  * that list.
3201  * @return the removed picture or NULL if an error occurs
3202  */
3203 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3204     Picture *pic;
3205
3206     pic= h->long_ref[i];
3207     if (pic){
3208         if(unreference_pic(h, pic, ref_mask)){
3209             assert(h->long_ref[i]->long_ref == 1);
3210             h->long_ref[i]->long_ref= 0;
3211             h->long_ref[i]= NULL;
3212             h->long_ref_count--;
3213         }
3214     }
3215
3216     return pic;
3217 }
3218
3219 /**
3220  * print short term list
3221  */
3222 static void print_short_term(H264Context *h) {
3223     uint32_t i;
3224     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3225         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3226         for(i=0; i<h->short_ref_count; i++){
3227             Picture *pic= h->short_ref[i];
3228             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3229         }
3230     }
3231 }
3232
3233 /**
3234  * print long term list
3235  */
3236 static void print_long_term(H264Context *h) {
3237     uint32_t i;
3238     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3239         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3240         for(i = 0; i < 16; i++){
3241             Picture *pic= h->long_ref[i];
3242             if (pic) {
3243                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3244             }
3245         }
3246     }
3247 }
3248
/**
 * Executes the reference picture marking (memory management control operations).
 *
 * Applies the mmco_count operations in mmco in order, then, unless one
 * of them already assigned the current picture as a long term
 * reference, inserts the current picture at the head of the short term
 * list (or just marks its second field). Finally, if the number of
 * short plus long term references exceeds sps.ref_frame_count, one
 * reference is discarded.
 * @param mmco       operations to execute
 * @param mmco_count number of entries in mmco
 * @return always 0
 */
static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
    MpegEncContext * const s = &h->s;
    int i, j;
    int current_ref_assigned=0;
    Picture *pic;

    if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
        av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");

    for(i=0; i<mmco_count; i++){
        int structure, frame_num;
        /* NOTE(review): this logs h->mmco[i] while the rest of the loop uses
         * the mmco parameter -- identical only if the caller passes h->mmco. */
        if(s->avctx->debug&FF_DEBUG_MMCO)
            av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);

        /* Both short term operations need the addressed picture; look it
         * up once here. If it is missing the operation is skipped. The
         * error is not logged for SHORT2LONG when the target long term slot
         * already holds a picture with that frame number. */
        if(   mmco[i].opcode == MMCO_SHORT2UNUSED
           || mmco[i].opcode == MMCO_SHORT2LONG){
            frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
            pic = find_short(h, frame_num, &j);
            if(!pic){
                if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
                   || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
                av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
                continue;
            }
        }

        switch(mmco[i].opcode){
        case MMCO_SHORT2UNUSED:
            if(s->avctx->debug&FF_DEBUG_MMCO)
                av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
            /* keep only the reference bits of the field(s) not addressed */
            remove_short(h, frame_num, structure ^ PICT_FRAME);
            break;
        case MMCO_SHORT2LONG:
                /* free the target slot unless it already holds this picture */
                if (h->long_ref[mmco[i].long_arg] != pic)
                    remove_long(h, mmco[i].long_arg, 0);

                remove_short_at_index(h, j);
                h->long_ref[ mmco[i].long_arg ]= pic;
                if (h->long_ref[ mmco[i].long_arg ]){
                    h->long_ref[ mmco[i].long_arg ]->long_ref=1;
                    h->long_ref_count++;
                }
            break;
        case MMCO_LONG2UNUSED:
            j = pic_num_extract(h, mmco[i].long_arg, &structure);
            pic = h->long_ref[j];
            if (pic) {
                remove_long(h, j, structure ^ PICT_FRAME);
            } else if(s->avctx->debug&FF_DEBUG_MMCO)
                av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
            break;
        case MMCO_LONG:
                    // Comment below left from previous code as it is an interesting note.
                    /* First field in pair is in short term list or
                     * at a different long term index.
                     * This is not allowed; see 7.4.3.3, notes 2 and 3.
                     * Report the problem and keep the pair where it is,
                     * and mark this field valid.
                     */

            /* store the current picture at the given long term index,
             * replacing whatever was there */
            if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
                remove_long(h, mmco[i].long_arg, 0);

                h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
                h->long_ref[ mmco[i].long_arg ]->long_ref=1;
                h->long_ref_count++;
            }

            s->current_picture_ptr->reference |= s->picture_structure;
            current_ref_assigned=1;
            break;
        case MMCO_SET_MAX_LONG:
            assert(mmco[i].long_arg <= 16);
            // just remove the long term which index is greater than new max
            for(j = mmco[i].long_arg; j<16; j++){
                remove_long(h, j, 0);
            }
            break;
        case MMCO_RESET:
            /* drop every reference and zero the poc / frame_num state */
            while(h->short_ref_count){
                remove_short(h, h->short_ref[0]->frame_num, 0);
            }
            for(j = 0; j < 16; j++) {
                remove_long(h, j, 0);
            }
            s->current_picture_ptr->poc=
            s->current_picture_ptr->field_poc[0]=
            s->current_picture_ptr->field_poc[1]=
            h->poc_lsb=
            h->poc_msb=
            h->frame_num=
            s->current_picture_ptr->frame_num= 0;
            break;
        default: assert(0);
        }
    }

    if (!current_ref_assigned) {
        /* Second field of complementary field pair; the first field of
         * which is already referenced. If short referenced, it
         * should be first entry in short_ref. If not, it must exist
         * in long_ref; trying to put it on the short list here is an
         * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
         */
        if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
            /* Just mark the second field valid */
            s->current_picture_ptr->reference = PICT_FRAME;
        } else if (s->current_picture_ptr->long_ref) {
            av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
                                             "assignment for second field "
                                             "in complementary field pair "
                                             "(first field is long term)\n");
        } else {
            /* a short term entry with the current frame number should not
             * exist yet; if one does, it is removed and reported */
            pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
            if(pic){
                av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
            }

            /* make room at the head of the list for the current picture */
            if(h->short_ref_count)
                memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));

            h->short_ref[0]= s->current_picture_ptr;
            h->short_ref_count++;
            s->current_picture_ptr->reference |= s->picture_structure;
        }
    }

    if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){

        /* We have too many reference frames, probably due to corrupted
         * stream. Need to discard one frame. Prevents overrun of the
         * short_ref and long_ref buffers.
         */
        av_log(h->s.avctx, AV_LOG_ERROR,
               "number of reference frames exceeds max (probably "
               "corrupt input), discarding one\n");

        /* only long term references left: drop the one with the lowest
         * index; otherwise drop the last short term entry */
        if (h->long_ref_count && !h->short_ref_count) {
            for (i = 0; i < 16; ++i)
                if (h->long_ref[i])
                    break;

            assert(i < 16);
            remove_long(h, i, 0);
        } else {
            pic = h->short_ref[h->short_ref_count - 1];
            remove_short(h, pic->frame_num, 0);
        }
    }

    print_short_term(h);
    print_long_term(h);
    return 0;
}
3406
3407 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3408     MpegEncContext * const s = &h->s;
3409     int i;
3410
3411     h->mmco_index= 0;
3412     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3413         s->broken_link= get_bits1(gb) -1;
3414         if(get_bits1(gb)){
3415             h->mmco[0].opcode= MMCO_LONG;
3416             h->mmco[0].long_arg= 0;
3417             h->mmco_index= 1;
3418         }
3419     }else{
3420         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3421             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3422                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3423
3424                 h->mmco[i].opcode= opcode;
3425                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3426                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3427 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3428                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3429                         return -1;
3430                     }*/
3431                 }
3432                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3433                     unsigned int long_arg= get_ue_golomb_31(gb);
3434                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3435                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3436                         return -1;
3437                     }
3438                     h->mmco[i].long_arg= long_arg;
3439                 }
3440
3441                 if(opcode > (unsigned)MMCO_LONG){
3442                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3443                     return -1;
3444                 }
3445                 if(opcode == MMCO_END)
3446                     break;
3447             }
3448             h->mmco_index= i;
3449         }else{
3450             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3451
3452             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3453                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3454                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3455                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3456                 h->mmco_index= 1;
3457                 if (FIELD_PICTURE) {
3458                     h->mmco[0].short_pic_num *= 2;
3459                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3460                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3461                     h->mmco_index= 2;
3462                 }
3463             }
3464         }
3465     }
3466
3467     return 0;
3468 }
3469
3470 static int init_poc(H264Context *h){
3471     MpegEncContext * const s = &h->s;
3472     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3473     int field_poc[2];
3474     Picture *cur = s->current_picture_ptr;
3475
3476     h->frame_num_offset= h->prev_frame_num_offset;
3477     if(h->frame_num < h->prev_frame_num)
3478         h->frame_num_offset += max_frame_num;
3479
3480     if(h->sps.poc_type==0){
3481         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3482
3483         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3484             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3485         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3486             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3487         else
3488             h->poc_msb = h->prev_poc_msb;
3489 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3490         field_poc[0] =
3491         field_poc[1] = h->poc_msb + h->poc_lsb;
3492         if(s->picture_structure == PICT_FRAME)
3493             field_poc[1] += h->delta_poc_bottom;
3494     }else if(h->sps.poc_type==1){
3495         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3496         int i;
3497
3498         if(h->sps.poc_cycle_length != 0)
3499             abs_frame_num = h->frame_num_offset + h->frame_num;
3500         else
3501             abs_frame_num = 0;
3502
3503         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3504             abs_frame_num--;
3505
3506         expected_delta_per_poc_cycle = 0;
3507         for(i=0; i < h->sps.poc_cycle_length; i++)
3508             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3509
3510         if(abs_frame_num > 0){
3511             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3512             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3513
3514             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3515             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3516                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3517         } else
3518             expectedpoc = 0;
3519
3520         if(h->nal_ref_idc == 0)
3521             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3522
3523         field_poc[0] = expectedpoc + h->delta_poc[0];
3524         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3525
3526         if(s->picture_structure == PICT_FRAME)
3527             field_poc[1] += h->delta_poc[1];
3528     }else{
3529         int poc= 2*(h->frame_num_offset + h->frame_num);
3530
3531         if(!h->nal_ref_idc)
3532             poc--;
3533
3534         field_poc[0]= poc;
3535         field_poc[1]= poc;
3536     }
3537
3538     if(s->picture_structure != PICT_BOTTOM_FIELD)
3539         s->current_picture_ptr->field_poc[0]= field_poc[0];
3540     if(s->picture_structure != PICT_TOP_FIELD)
3541         s->current_picture_ptr->field_poc[1]= field_poc[1];
3542     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3543
3544     return 0;
3545 }
3546
3547
3548 /**
3549  * initialize scan tables
3550  */
3551 static void init_scan_tables(H264Context *h){
3552     MpegEncContext * const s = &h->s;
3553     int i;
3554     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3555         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3556         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3557     }else{
3558         for(i=0; i<16; i++){
3559 #define T(x) (x>>2) | ((x<<2) & 0xF)
3560             h->zigzag_scan[i] = T(zigzag_scan[i]);
3561             h-> field_scan[i] = T( field_scan[i]);
3562 #undef T
3563         }
3564     }
3565     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3566         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3567         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3568         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3569         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3570     }else{
3571         for(i=0; i<64; i++){
3572 #define T(x) (x>>3) | ((x&7)<<3)
3573             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3574             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3575             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3576             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3577 #undef T
3578         }
3579     }
3580     if(h->sps.transform_bypass){ //FIXME same ugly
3581         h->zigzag_scan_q0          = zigzag_scan;
3582         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3583         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3584         h->field_scan_q0           = field_scan;
3585         h->field_scan8x8_q0        = field_scan8x8;
3586         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3587     }else{
3588         h->zigzag_scan_q0          = h->zigzag_scan;
3589         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3590         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3591         h->field_scan_q0           = h->field_scan;
3592         h->field_scan8x8_q0        = h->field_scan8x8;
3593         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3594     }
3595 }
3596
3597 /**
3598  * Replicates H264 "master" context to thread contexts.
3599  */
3600 static void clone_slice(H264Context *dst, H264Context *src)
3601 {
3602     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3603     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3604     dst->s.current_picture      = src->s.current_picture;
3605     dst->s.linesize             = src->s.linesize;
3606     dst->s.uvlinesize           = src->s.uvlinesize;
3607     dst->s.first_field          = src->s.first_field;
3608
3609     dst->prev_poc_msb           = src->prev_poc_msb;
3610     dst->prev_poc_lsb           = src->prev_poc_lsb;
3611     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3612     dst->prev_frame_num         = src->prev_frame_num;
3613     dst->short_ref_count        = src->short_ref_count;
3614
3615     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3616     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3617     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3618     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3619
3620     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3621     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3622 }
3623
3624 /**
3625  * decodes a slice header.
3626  * This will also call MPV_common_init() and frame_start() as needed.
3627  *
3628  * @param h h264context
3629  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3630  *
3631  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3632  */
3633 static int decode_slice_header(H264Context *h, H264Context *h0){
3634     MpegEncContext * const s = &h->s;
3635     MpegEncContext * const s0 = &h0->s;
3636     unsigned int first_mb_in_slice;
3637     unsigned int pps_id;
3638     int num_ref_idx_active_override_flag;
3639     unsigned int slice_type, tmp, i, j;
3640     int default_ref_list_done = 0;
3641     int last_pic_structure;
3642
3643     s->dropable= h->nal_ref_idc == 0;
3644
3645     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3646         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3647         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3648     }else{
3649         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3650         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3651     }
3652
3653     first_mb_in_slice= get_ue_golomb(&s->gb);
3654
3655     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3656         h0->current_slice = 0;
3657         if (!s0->first_field)
3658             s->current_picture_ptr= NULL;
3659     }
3660
3661     slice_type= get_ue_golomb_31(&s->gb);
3662     if(slice_type > 9){
3663         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3664         return -1;
3665     }
3666     if(slice_type > 4){
3667         slice_type -= 5;
3668         h->slice_type_fixed=1;
3669     }else
3670         h->slice_type_fixed=0;
3671
3672     slice_type= golomb_to_pict_type[ slice_type ];
3673     if (slice_type == FF_I_TYPE
3674         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3675         default_ref_list_done = 1;
3676     }
3677     h->slice_type= slice_type;
3678     h->slice_type_nos= slice_type & 3;
3679
3680     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3681     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3682         av_log(h->s.avctx, AV_LOG_ERROR,
3683                "B picture before any references, skipping\n");
3684         return -1;
3685     }
3686
3687     pps_id= get_ue_golomb(&s->gb);
3688     if(pps_id>=MAX_PPS_COUNT){
3689         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3690         return -1;
3691     }
3692     if(!h0->pps_buffers[pps_id]) {
3693         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3694         return -1;
3695     }
3696     h->pps= *h0->pps_buffers[pps_id];
3697
3698     if(!h0->sps_buffers[h->pps.sps_id]) {
3699         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3700         return -1;
3701     }
3702     h->sps = *h0->sps_buffers[h->pps.sps_id];
3703
3704     if(h == h0 && h->dequant_coeff_pps != pps_id){
3705         h->dequant_coeff_pps = pps_id;
3706         init_dequant_tables(h);
3707     }
3708
3709     s->mb_width= h->sps.mb_width;
3710     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3711
3712     h->b_stride=  s->mb_width*4;
3713     h->b8_stride= s->mb_width*2;
3714
3715     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3716     if(h->sps.frame_mbs_only_flag)
3717         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3718     else
3719         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3720
3721     if (s->context_initialized
3722         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3723         if(h != h0)
3724             return -1;   // width / height changed during parallelized decoding
3725         free_tables(h);
3726         flush_dpb(s->avctx);
3727         MPV_common_end(s);
3728     }
3729     if (!s->context_initialized) {
3730         if(h != h0)
3731             return -1;  // we cant (re-)initialize context during parallel decoding
3732         if (MPV_common_init(s) < 0)
3733             return -1;
3734         s->first_field = 0;
3735
3736         init_scan_tables(h);
3737         alloc_tables(h);
3738
3739         for(i = 1; i < s->avctx->thread_count; i++) {
3740             H264Context *c;
3741             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3742             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3743             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3744             c->sps = h->sps;
3745             c->pps = h->pps;
3746             init_scan_tables(c);
3747             clone_tables(c, h);
3748         }
3749
3750         for(i = 0; i < s->avctx->thread_count; i++)
3751             if(context_init(h->thread_context[i]) < 0)
3752                 return -1;
3753
3754         s->avctx->width = s->width;
3755         s->avctx->height = s->height;
3756         s->avctx->sample_aspect_ratio= h->sps.sar;
3757         if(!s->avctx->sample_aspect_ratio.den)
3758             s->avctx->sample_aspect_ratio.den = 1;
3759
3760         if(h->sps.timing_info_present_flag){
3761             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3762             if(h->x264_build > 0 && h->x264_build < 44)
3763                 s->avctx->time_base.den *= 2;
3764             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3765                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3766         }
3767     }
3768
3769     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3770
3771     h->mb_mbaff = 0;
3772     h->mb_aff_frame = 0;
3773     last_pic_structure = s0->picture_structure;
3774     if(h->sps.frame_mbs_only_flag){
3775         s->picture_structure= PICT_FRAME;
3776     }else{
3777         if(get_bits1(&s->gb)) { //field_pic_flag
3778             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3779         } else {
3780             s->picture_structure= PICT_FRAME;
3781             h->mb_aff_frame = h->sps.mb_aff;
3782         }
3783     }
3784     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3785
3786     if(h0->current_slice == 0){
3787         while(h->frame_num !=  h->prev_frame_num &&
3788               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3789             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3790             frame_start(h);
3791             h->prev_frame_num++;
3792             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3793             s->current_picture_ptr->frame_num= h->prev_frame_num;
3794             execute_ref_pic_marking(h, NULL, 0);
3795         }
3796
3797         /* See if we have a decoded first field looking for a pair... */
3798         if (s0->first_field) {
3799             assert(s0->current_picture_ptr);
3800             assert(s0->current_picture_ptr->data[0]);
3801             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3802
3803             /* figure out if we have a complementary field pair */
3804             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3805                 /*
3806                  * Previous field is unmatched. Don't display it, but let it
3807                  * remain for reference if marked as such.
3808                  */
3809                 s0->current_picture_ptr = NULL;
3810                 s0->first_field = FIELD_PICTURE;
3811
3812             } else {
3813                 if (h->nal_ref_idc &&
3814                         s0->current_picture_ptr->reference &&
3815                         s0->current_picture_ptr->frame_num != h->frame_num) {
3816                     /*
3817                      * This and previous field were reference, but had
3818                      * different frame_nums. Consider this field first in
3819                      * pair. Throw away previous field except for reference
3820                      * purposes.
3821                      */
3822                     s0->first_field = 1;
3823                     s0->current_picture_ptr = NULL;
3824
3825                 } else {
3826                     /* Second field in complementary pair */
3827                     s0->first_field = 0;
3828                 }
3829             }
3830
3831         } else {
3832             /* Frame or first field in a potentially complementary pair */
3833             assert(!s0->current_picture_ptr);
3834             s0->first_field = FIELD_PICTURE;
3835         }
3836
3837         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3838             s0->first_field = 0;
3839             return -1;
3840         }
3841     }
3842     if(h != h0)
3843         clone_slice(h, h0);
3844
3845     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3846
3847     assert(s->mb_num == s->mb_width * s->mb_height);
3848     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3849        first_mb_in_slice                    >= s->mb_num){
3850         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3851         return -1;
3852     }
3853     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3854     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3855     if (s->picture_structure == PICT_BOTTOM_FIELD)
3856         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3857     assert(s->mb_y < s->mb_height);
3858
3859     if(s->picture_structure==PICT_FRAME){
3860         h->curr_pic_num=   h->frame_num;
3861         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3862     }else{
3863         h->curr_pic_num= 2*h->frame_num + 1;
3864         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3865     }
3866
3867     if(h->nal_unit_type == NAL_IDR_SLICE){
3868         get_ue_golomb(&s->gb); /* idr_pic_id */
3869     }
3870
3871     if(h->sps.poc_type==0){
3872         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3873
3874         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3875             h->delta_poc_bottom= get_se_golomb(&s->gb);
3876         }
3877     }
3878
3879     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3880         h->delta_poc[0]= get_se_golomb(&s->gb);
3881
3882         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3883             h->delta_poc[1]= get_se_golomb(&s->gb);
3884     }
3885
3886     init_poc(h);
3887
3888     if(h->pps.redundant_pic_cnt_present){
3889         h->redundant_pic_count= get_ue_golomb(&s->gb);
3890     }
3891
3892     //set defaults, might be overridden a few lines later
3893     h->ref_count[0]= h->pps.ref_count[0];
3894     h->ref_count[1]= h->pps.ref_count[1];
3895
3896     if(h->slice_type_nos != FF_I_TYPE){
3897         if(h->slice_type_nos == FF_B_TYPE){
3898             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3899         }
3900         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3901
3902         if(num_ref_idx_active_override_flag){
3903             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3904             if(h->slice_type_nos==FF_B_TYPE)
3905                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3906
3907             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3908                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3909                 h->ref_count[0]= h->ref_count[1]= 1;
3910                 return -1;
3911             }
3912         }
3913         if(h->slice_type_nos == FF_B_TYPE)
3914             h->list_count= 2;
3915         else
3916             h->list_count= 1;
3917     }else
3918         h->list_count= 0;
3919
3920     if(!default_ref_list_done){
3921         fill_default_ref_list(h);
3922     }
3923
3924     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3925         return -1;
3926
3927     if(h->slice_type_nos!=FF_I_TYPE){
3928         s->last_picture_ptr= &h->ref_list[0][0];
3929         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3930     }
3931     if(h->slice_type_nos==FF_B_TYPE){
3932         s->next_picture_ptr= &h->ref_list[1][0];
3933         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3934     }
3935
3936     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3937        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3938         pred_weight_table(h);
3939     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3940         implicit_weight_table(h);
3941     else
3942         h->use_weight = 0;
3943
3944     if(h->nal_ref_idc)
3945         decode_ref_pic_marking(h0, &s->gb);
3946
3947     if(FRAME_MBAFF)
3948         fill_mbaff_ref_list(h);
3949
3950     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3951         direct_dist_scale_factor(h);
3952     direct_ref_list_init(h);
3953
3954     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3955         tmp = get_ue_golomb_31(&s->gb);
3956         if(tmp > 2){
3957             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3958             return -1;
3959         }
3960         h->cabac_init_idc= tmp;
3961     }
3962
3963     h->last_qscale_diff = 0;
3964     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3965     if(tmp>51){
3966         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3967         return -1;
3968     }
3969     s->qscale= tmp;
3970     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3971     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3972     //FIXME qscale / qp ... stuff
3973     if(h->slice_type == FF_SP_TYPE){
3974         get_bits1(&s->gb); /* sp_for_switch_flag */
3975     }
3976     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3977         get_se_golomb(&s->gb); /* slice_qs_delta */
3978     }
3979
3980     h->deblocking_filter = 1;
3981     h->slice_alpha_c0_offset = 0;
3982     h->slice_beta_offset = 0;
3983     if( h->pps.deblocking_filter_parameters_present ) {
3984         tmp= get_ue_golomb_31(&s->gb);
3985         if(tmp > 2){
3986             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3987             return -1;
3988         }
3989         h->deblocking_filter= tmp;
3990         if(h->deblocking_filter < 2)
3991             h->deblocking_filter^= 1; // 1<->0
3992
3993         if( h->deblocking_filter ) {
3994             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3995             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3996         }
3997     }
3998
3999     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4000        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4001        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4002        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4003         h->deblocking_filter= 0;
4004
4005     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4006         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4007             /* Cheat slightly for speed:
4008                Do not bother to deblock across slices. */
4009             h->deblocking_filter = 2;
4010         } else {
4011             h0->max_contexts = 1;
4012             if(!h0->single_decode_warning) {
4013                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4014                 h0->single_decode_warning = 1;
4015             }
4016             if(h != h0)
4017                 return 1; // deblocking switched inside frame
4018         }
4019     }
4020
4021 #if 0 //FMO
4022     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4023         slice_group_change_cycle= get_bits(&s->gb, ?);
4024 #endif
4025
4026     h0->last_slice_type = slice_type;
4027     h->slice_num = ++h0->current_slice;
4028     if(h->slice_num >= MAX_SLICES){
4029         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4030     }
4031
4032     for(j=0; j<2; j++){
4033         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4034         ref2frm[0]=
4035         ref2frm[1]= -1;
4036         for(i=0; i<16; i++)
4037             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4038                           +(h->ref_list[j][i].reference&3);
4039         ref2frm[18+0]=
4040         ref2frm[18+1]= -1;
4041         for(i=16; i<48; i++)
4042             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4043                           +(h->ref_list[j][i].reference&3);
4044     }
4045
4046     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4047     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4048
4049     s->avctx->refs= h->sps.ref_frame_count;
4050
4051     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4052         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4053                h->slice_num,
4054                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4055                first_mb_in_slice,
4056                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4057                pps_id, h->frame_num,
4058                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4059                h->ref_count[0], h->ref_count[1],
4060                s->qscale,
4061                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4062                h->use_weight,
4063                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4064                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4065                );
4066     }
4067
4068     return 0;
4069 }
4070
4071 /**
4072  *
4073  */
4074 static inline int get_level_prefix(GetBitContext *gb){
4075     unsigned int buf;
4076     int log;
4077
4078     OPEN_READER(re, gb);
4079     UPDATE_CACHE(re, gb);
4080     buf=GET_CACHE(re, gb);
4081
4082     log= 32 - av_log2(buf);
4083 #ifdef TRACE
4084     print_bin(buf>>(32-log), log);
4085     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4086 #endif
4087
4088     LAST_SKIP_BITS(re, gb, log);
4089     CLOSE_READER(re, gb);
4090
4091     return log-1;
4092 }
4093
4094 static inline int get_dct8x8_allowed(H264Context *h){
4095     if(h->sps.direct_8x8_inference_flag)
4096         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4097     else
4098         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4099 }
4100
4101 /**
4102  * decodes a residual block.
4103  * @param n block index
4104  * @param scantable scantable
4105  * @param max_coeff number of coefficients in the block
4106  * @return <0 if an error occurred
4107  */
4108 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4109     MpegEncContext * const s = &h->s;
4110     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4111     int level[16];
4112     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4113
4114     //FIXME put trailing_onex into the context
4115
4116     if(n == CHROMA_DC_BLOCK_INDEX){
4117         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4118         total_coeff= coeff_token>>2;
4119     }else{
4120         if(n == LUMA_DC_BLOCK_INDEX){
4121             total_coeff= pred_non_zero_count(h, 0);
4122             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4123             total_coeff= coeff_token>>2;
4124         }else{
4125             total_coeff= pred_non_zero_count(h, n);
4126             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4127             total_coeff= coeff_token>>2;
4128             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4129         }
4130     }
4131
4132     //FIXME set last_non_zero?
4133
4134     if(total_coeff==0)
4135         return 0;
4136     if(total_coeff > (unsigned)max_coeff) {
4137         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4138         return -1;
4139     }
4140
4141     trailing_ones= coeff_token&3;
4142     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4143     assert(total_coeff<=16);
4144
4145     i = show_bits(gb, 3);
4146     skip_bits(gb, trailing_ones);
4147     level[0] = 1-((i&4)>>1);
4148     level[1] = 1-((i&2)   );
4149     level[2] = 1-((i&1)<<1);
4150
4151     if(trailing_ones<total_coeff) {
4152         int mask, prefix;
4153         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4154         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4155         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4156
4157         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4158         if(level_code >= 100){
4159             prefix= level_code - 100;
4160             if(prefix == LEVEL_TAB_BITS)
4161                 prefix += get_level_prefix(gb);
4162
4163             //first coefficient has suffix_length equal to 0 or 1
4164             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4165                 if(suffix_length)
4166                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4167                 else
4168                     level_code= (prefix<<suffix_length); //part
4169             }else if(prefix==14){
4170                 if(suffix_length)
4171                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4172                 else
4173                     level_code= prefix + get_bits(gb, 4); //part
4174             }else{
4175                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4176                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4177                 if(prefix>=16)
4178                     level_code += (1<<(prefix-3))-4096;
4179             }
4180
4181             if(trailing_ones < 3) level_code += 2;
4182
4183             suffix_length = 2;
4184             mask= -(level_code&1);
4185             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4186         }else{
4187             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4188
4189             suffix_length = 1;
4190             if(level_code + 3U > 6U)
4191                 suffix_length++;
4192             level[trailing_ones]= level_code;
4193         }
4194
4195         //remaining coefficients have suffix_length > 0
4196         for(i=trailing_ones+1;i<total_coeff;i++) {
4197             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4198             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4199             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4200
4201             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4202             if(level_code >= 100){
4203                 prefix= level_code - 100;
4204                 if(prefix == LEVEL_TAB_BITS){
4205                     prefix += get_level_prefix(gb);
4206                 }
4207                 if(prefix<15){
4208                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4209                 }else{
4210                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4211                     if(prefix>=16)
4212                         level_code += (1<<(prefix-3))-4096;
4213                 }
4214                 mask= -(level_code&1);
4215                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4216             }
4217             level[i]= level_code;
4218
4219             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4220                 suffix_length++;
4221         }
4222     }
4223
4224     if(total_coeff == max_coeff)
4225         zeros_left=0;
4226     else{
4227         if(n == CHROMA_DC_BLOCK_INDEX)
4228             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4229         else
4230             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4231     }
4232
4233     coeff_num = zeros_left + total_coeff - 1;
4234     j = scantable[coeff_num];
4235     if(n > 24){
4236         block[j] = level[0];
4237         for(i=1;i<total_coeff;i++) {
4238             if(zeros_left <= 0)
4239                 run_before = 0;
4240             else if(zeros_left < 7){
4241                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4242             }else{
4243                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4244             }
4245             zeros_left -= run_before;
4246             coeff_num -= 1 + run_before;
4247             j= scantable[ coeff_num ];
4248
4249             block[j]= level[i];
4250         }
4251     }else{
4252         block[j] = (level[0] * qmul[j] + 32)>>6;
4253         for(i=1;i<total_coeff;i++) {
4254             if(zeros_left <= 0)
4255                 run_before = 0;
4256             else if(zeros_left < 7){
4257                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4258             }else{
4259                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4260             }
4261             zeros_left -= run_before;
4262             coeff_num -= 1 + run_before;
4263             j= scantable[ coeff_num ];
4264
4265             block[j]= (level[i] * qmul[j] + 32)>>6;
4266         }
4267     }
4268
4269     if(zeros_left<0){
4270         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4271         return -1;
4272     }
4273
4274     return 0;
4275 }
4276
4277 static void predict_field_decoding_flag(H264Context *h){
4278     MpegEncContext * const s = &h->s;
4279     const int mb_xy= h->mb_xy;
4280     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4281                 ? s->current_picture.mb_type[mb_xy-1]
4282                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4283                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4284                 : 0;
4285     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4286 }
4287
4288 /**
4289  * decodes a P_SKIP or B_SKIP macroblock
4290  */
4291 static void decode_mb_skip(H264Context *h){
4292     MpegEncContext * const s = &h->s;
4293     const int mb_xy= h->mb_xy;
4294     int mb_type=0;
4295
4296     memset(h->non_zero_count[mb_xy], 0, 16);
4297     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4298
4299     if(MB_FIELD)
4300         mb_type|= MB_TYPE_INTERLACED;
4301
4302     if( h->slice_type_nos == FF_B_TYPE )
4303     {
4304         // just for fill_caches. pred_direct_motion will set the real mb_type
4305         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4306
4307         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4308         pred_direct_motion(h, &mb_type);
4309         mb_type|= MB_TYPE_SKIP;
4310     }
4311     else
4312     {
4313         int mx, my;
4314         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4315
4316         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4317         pred_pskip_motion(h, &mx, &my);
4318         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4319         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4320     }
4321
4322     write_back_motion(h, mb_type);
4323     s->current_picture.mb_type[mb_xy]= mb_type;
4324     s->current_picture.qscale_table[mb_xy]= s->qscale;
4325     h->slice_table[ mb_xy ]= h->slice_num;
4326     h->prev_mb_skipped= 1;
4327 }
4328
4329 /**
4330  * decodes a macroblock
      *
      * Parses the CAVLC coded syntax of the macroblock at (s->mb_x, s->mb_y) in
      * this order: skip run, optional field decoding flag, mb_type, then either
      * PCM samples, intra prediction modes, or reference indices and motion
      * vectors, followed by the coded block pattern, the optional 8x8 transform
      * flag, dquant and the residual blocks. Results are stored in the caches of
      * the H264Context and in the per-picture tables indexed by mb_xy.
      *
4331  * @returns 0 if OK, -1 if an error is noticed (every error path visible here returns -1)
4332  */
4333 static int decode_mb_cavlc(H264Context *h){
4334     MpegEncContext * const s = &h->s;
4335     int mb_xy;
4336     int partition_count;
4337     unsigned int mb_type, cbp;
4338     int dct8x8_allowed= h->pps.transform_8x8_mode;
4339
4340     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4341
4342     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4343     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4344                 down the code */
         // Outside I slices a run of skipped macroblocks may precede the next coded
         // one. mb_skip_run == -1 means no run length is pending, so a new one is read.
4345     if(h->slice_type_nos != FF_I_TYPE){
4346         if(s->mb_skip_run==-1)
4347             s->mb_skip_run= get_ue_golomb(&s->gb);
4348
             // post-decrement: a run of 0 falls through with the counter left at -1
4349         if (s->mb_skip_run--) {
                 // MBAFF frame, even macroblock row: the field flag is read from the
                 // bitstream only if this skip exhausts the run, otherwise it is predicted
4350             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4351                 if(s->mb_skip_run==0)
4352                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4353                 else
4354                     predict_field_decoding_flag(h);
4355             }
4356             decode_mb_skip(h);
4357             return 0;
4358         }
4359     }
         // for a coded macroblock the field flag is read only on even rows of an MBAFF frame
4360     if(FRAME_MBAFF){
4361         if( (s->mb_y&1) == 0 )
4362             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4363     }
4364
4365     h->prev_mb_skipped= 0;
4366
         // mb_type is looked up in the table of the slice type; values past the end of
         // the B (23 entries) or P (5 entries) range are re-based and treated as intra types
4367     mb_type= get_ue_golomb(&s->gb);
4368     if(h->slice_type_nos == FF_B_TYPE){
4369         if(mb_type < 23){
4370             partition_count= b_mb_type_info[mb_type].partition_count;
4371             mb_type=         b_mb_type_info[mb_type].type;
4372         }else{
4373             mb_type -= 23;
4374             goto decode_intra_mb;
4375         }
4376     }else if(h->slice_type_nos == FF_P_TYPE){
4377         if(mb_type < 5){
4378             partition_count= p_mb_type_info[mb_type].partition_count;
4379             mb_type=         p_mb_type_info[mb_type].type;
4380         }else{
4381             mb_type -= 5;
4382             goto decode_intra_mb;
4383         }
4384     }else{
4385        assert(h->slice_type_nos == FF_I_TYPE);
             // in SI slices a non-zero mb_type is shifted down by one before the intra lookup
4386         if(h->slice_type == FF_SI_TYPE && mb_type)
4387             mb_type--;
4388 decode_intra_mb:
             // mb_type is unsigned, so this single test bounds the i_mb_type_info index
4389         if(mb_type > 25){
4390             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4391             return -1;
4392         }
4393         partition_count=0;
4394         cbp= i_mb_type_info[mb_type].cbp;
4395         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4396         mb_type= i_mb_type_info[mb_type].type;
4397     }
4398
4399     if(MB_FIELD)
4400         mb_type |= MB_TYPE_INTERLACED;
4401
4402     h->slice_table[ mb_xy ]= h->slice_num;
4403
4404     if(IS_INTRA_PCM(mb_type)){
4405         unsigned int x;
4406
4407         // We assume these blocks are very rare so we do not optimize it.
4408         align_get_bits(&s->gb);
4409
4410         // The pixels are stored in the same order as levels in h->mb array.
             // 256 bytes are read without chroma, 384 with chroma
4411         for(x=0; x < (CHROMA ? 384 : 256); x++){
4412             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4413         }
4414
4415         // In deblocking, the quantizer is 0
4416         s->current_picture.qscale_table[mb_xy]= 0;
4417         // All coeffs are present
4418         memset(h->non_zero_count[mb_xy], 16, 16);
4419
4420         s->current_picture.mb_type[mb_xy]= mb_type;
4421         return 0;
4422     }
4423
         // ref_count is doubled while an MB_MBAFF macroblock is parsed and halved again
         // just before the final return 0 below.
         // NOTE(review): the "return -1" error paths in between leave it doubled --
         // confirm that callers re-initialise ref_count after an error.
4424     if(MB_MBAFF){
4425         h->ref_count[0] <<= 1;
4426         h->ref_count[1] <<= 1;
4427     }
4428
4429     fill_caches(h, mb_type, 0);
4430
4431     //mb_pred
4432     if(IS_INTRA(mb_type)){
4433         int pred_mode;
4434 //            init_top_left_availability(h);
4435         if(IS_INTRA4x4(mb_type)){
4436             int i;
4437             int di = 1;
                 // with the 8x8 transform one prediction mode is read per 8x8 block
                 // (step of 4 block indices) instead of one per 4x4 block
4438             if(dct8x8_allowed && get_bits1(&s->gb)){
4439                 mb_type |= MB_TYPE_8x8DCT;
4440                 di = 4;
4441             }
4442
4443 //                fill_intra4x4_pred_table(h);
4444             for(i=0; i<16; i+=di){
4445                 int mode= pred_intra_mode(h, i);
4446
                     // a set flag bit keeps the predicted mode; otherwise a 3 bit
                     // remainder is read and mapped so that it skips the predicted value
4447                 if(!get_bits1(&s->gb)){
4448                     const int rem_mode= get_bits(&s->gb, 3);
4449                     mode = rem_mode + (rem_mode >= mode);
4450                 }
4451
4452                 if(di==4)
4453                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4454                 else
4455                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4456             }
4457             write_back_intra_pred_mode(h);
4458             if( check_intra4x4_pred_mode(h) < 0)
4459                 return -1;
4460         }else{
4461             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4462             if(h->intra16x16_pred_mode < 0)
4463                 return -1;
4464         }
4465         if(CHROMA){
4466             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4467             if(pred_mode < 0)
4468                 return -1;
4469             h->chroma_pred_mode= pred_mode;
4470         }
4471     }else if(partition_count==4){
             // four sub-macroblocks: sub types first, then all reference indices,
             // then all motion vector differences
4472         int i, j, sub_partition_count[4], list, ref[2][4];
4473
4474         if(h->slice_type_nos == FF_B_TYPE){
4475             for(i=0; i<4; i++){
4476                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4477                 if(h->sub_mb_type[i] >=13){
4478                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4479                     return -1;
4480                 }
4481                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4482                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4483             }
                 // if any sub-macroblock is direct, run direct prediction once for the
                 // macroblock, then mark four ref_cache entries as not available
4484             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4485                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4486                 pred_direct_motion(h, &mb_type);
4487                 h->ref_cache[0][scan8[4]] =
4488                 h->ref_cache[1][scan8[4]] =
4489                 h->ref_cache[0][scan8[12]] =
4490                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4491             }
4492         }else{
4493             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4494             for(i=0; i<4; i++){
4495                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4496                 if(h->sub_mb_type[i] >=4){
4497                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4498                     return -1;
4499                 }
4500                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4501                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4502             }
4503         }
4504
             // reference indices: nothing is read when only one reference exists, a
             // single inverted bit when there are two, an Exp-Golomb code otherwise
4505         for(list=0; list<h->list_count; list++){
4506             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4507             for(i=0; i<4; i++){
4508                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4509                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4510                     unsigned int tmp;
4511                     if(ref_count == 1){
4512                         tmp= 0;
4513                     }else if(ref_count == 2){
4514                         tmp= get_bits1(&s->gb)^1;
4515                     }else{
4516                         tmp= get_ue_golomb_31(&s->gb);
4517                         if(tmp>=ref_count){
4518                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4519                             return -1;
4520                         }
4521                     }
4522                     ref[list][i]= tmp;
4523                 }else{
4524                  //FIXME
4525                     ref[list][i] = -1;
4526                 }
4527             }
4528         }
4529
4530         if(dct8x8_allowed)
4531             dct8x8_allowed = get_dct8x8_allowed(h);
4532
             // motion vectors: each sub-partition adds a signed Exp-Golomb difference to
             // the predicted vector and stores the result in the cache entries it covers
4533         for(list=0; list<h->list_count; list++){
4534             for(i=0; i<4; i++){
4535                 if(IS_DIRECT(h->sub_mb_type[i])) {
4536                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4537                     continue;
4538                 }
4539                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4540                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4541
4542                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4543                     const int sub_mb_type= h->sub_mb_type[i];
4544                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4545                     for(j=0; j<sub_partition_count[i]; j++){
4546                         int mx, my;
4547                         const int index= 4*i + block_width*j;
4548                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4549                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4550                         mx += get_se_golomb(&s->gb);
4551                         my += get_se_golomb(&s->gb);
4552                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4553
                             // offsets 1, 8 and 9 are the right, lower and lower-right
                             // neighbours of the entry in the 8 wide cache
4554                         if(IS_SUB_8X8(sub_mb_type)){
4555                             mv_cache[ 1 ][0]=
4556                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4557                             mv_cache[ 1 ][1]=
4558                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4559                         }else if(IS_SUB_8X4(sub_mb_type)){
4560                             mv_cache[ 1 ][0]= mx;
4561                             mv_cache[ 1 ][1]= my;
4562                         }else if(IS_SUB_4X8(sub_mb_type)){
4563                             mv_cache[ 8 ][0]= mx;
4564                             mv_cache[ 8 ][1]= my;
4565                         }
4566                         mv_cache[ 0 ][0]= mx;
4567                         mv_cache[ 0 ][1]= my;
4568                     }
4569                 }else{
                         // list not used by this sub-macroblock: zero its four vectors
4570                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4571                     p[0] = p[1]=
4572                     p[8] = p[9]= 0;
4573                 }
4574             }
4575         }
4576     }else if(IS_DIRECT(mb_type)){
4577         pred_direct_motion(h, &mb_type);
4578         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4579     }else{
             // 16x16, 16x8 or 8x16 partitioning: for each shape all reference indices
             // are read first (same coding as in the sub-macroblock case above), then
             // all motion vector differences
4580         int list, mx, my, i;
4581          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4582         if(IS_16X16(mb_type)){
4583             for(list=0; list<h->list_count; list++){
4584                     unsigned int val;
4585                     if(IS_DIR(mb_type, 0, list)){
4586                         if(h->ref_count[list]==1){
4587                             val= 0;
4588                         }else if(h->ref_count[list]==2){
4589                             val= get_bits1(&s->gb)^1;
4590                         }else{
4591                             val= get_ue_golomb_31(&s->gb);
4592                             if(val >= h->ref_count[list]){
4593                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4594                                 return -1;
4595                             }
4596                         }
4597                     }else
4598                         val= LIST_NOT_USED&0xFF;
4599                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4600             }
4601             for(list=0; list<h->list_count; list++){
4602                 unsigned int val;
4603                 if(IS_DIR(mb_type, 0, list)){
4604                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4605                     mx += get_se_golomb(&s->gb);
4606                     my += get_se_golomb(&s->gb);
4607                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4608
4609                     val= pack16to32(mx,my);
4610                 }else
4611                     val=0;
4612                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4613             }
4614         }
4615         else if(IS_16X8(mb_type)){
                 // two partitions; partition i starts 16 cache entries (two rows of 8)
                 // after partition 0
4616             for(list=0; list<h->list_count; list++){
4617                     for(i=0; i<2; i++){
4618                         unsigned int val;
4619                         if(IS_DIR(mb_type, i, list)){
4620                             if(h->ref_count[list] == 1){
4621                                 val= 0;
4622                             }else if(h->ref_count[list] == 2){
4623                                 val= get_bits1(&s->gb)^1;
4624                             }else{
4625                                 val= get_ue_golomb_31(&s->gb);
4626                                 if(val >= h->ref_count[list]){
4627                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4628                                     return -1;
4629                                 }
4630                             }
4631                         }else
4632                             val= LIST_NOT_USED&0xFF;
4633                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4634                     }
4635             }
4636             for(list=0; list<h->list_count; list++){
4637                 for(i=0; i<2; i++){
4638                     unsigned int val;
4639                     if(IS_DIR(mb_type, i, list)){
4640                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4641                         mx += get_se_golomb(&s->gb);
4642                         my += get_se_golomb(&s->gb);
4643                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4644
4645                         val= pack16to32(mx,my);
4646                     }else
4647                         val=0;
4648                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4649                 }
4650             }
4651         }else{
                 // two partitions; partition i starts 2 cache entries to the right of
                 // partition 0
4652             assert(IS_8X16(mb_type));
4653             for(list=0; list<h->list_count; list++){
4654                     for(i=0; i<2; i++){
4655                         unsigned int val;
4656                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4657                             if(h->ref_count[list]==1){
4658                                 val= 0;
4659                             }else if(h->ref_count[list]==2){
4660                                 val= get_bits1(&s->gb)^1;
4661                             }else{
4662                                 val= get_ue_golomb_31(&s->gb);
4663                                 if(val >= h->ref_count[list]){
4664                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4665                                     return -1;
4666                                 }
4667                             }
4668                         }else
4669                             val= LIST_NOT_USED&0xFF;
4670                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4671                     }
4672             }
4673             for(list=0; list<h->list_count; list++){
4674                 for(i=0; i<2; i++){
4675                     unsigned int val;
4676                     if(IS_DIR(mb_type, i, list)){
4677                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4678                         mx += get_se_golomb(&s->gb);
4679                         my += get_se_golomb(&s->gb);
4680                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4681
4682                         val= pack16to32(mx,my);
4683                     }else
4684                         val=0;
4685                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4686                 }
4687             }
4688         }
4689     }
4690
4691     if(IS_INTER(mb_type))
4692         write_back_motion(h, mb_type);
4693
         // Intra16x16 took its cbp from i_mb_type_info above; every other type reads a
         // code number (0..47) that is mapped to a cbp by an intra or inter table
4694     if(!IS_INTRA16x16(mb_type)){
4695         cbp= get_ue_golomb(&s->gb);
4696         if(cbp > 47){
4697             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4698             return -1;
4699         }
4700
4701         if(CHROMA){
4702             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4703             else                     cbp= golomb_to_inter_cbp   [cbp];
4704         }else{
4705             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4706             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4707         }
4708     }
4709     h->cbp = cbp;
4710
         // non-intra macroblocks with coded luma (low 4 cbp bits) carry the 8x8
         // transform flag here; intra 4x4 read it before the prediction modes
4711     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4712         if(get_bits1(&s->gb)){
4713             mb_type |= MB_TYPE_8x8DCT;
4714             h->cbp_table[mb_xy]= cbp;
4715         }
4716     }
4717     s->current_picture.mb_type[mb_xy]= mb_type;
4718
4719     if(cbp || IS_INTRA16x16(mb_type)){
4720         int i8x8, i4x4, chroma_idx;
4721         int dquant;
4722         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4723         const uint8_t *scan, *scan8x8, *dc_scan;
4724
4725 //        fill_non_zero_count_cache(h);
4726
             // scan tables: field or zigzag order, with separate *_q0 variants that
             // are selected when qscale is 0
4727         if(IS_INTERLACED(mb_type)){
4728             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4729             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4730             dc_scan= luma_dc_field_scan;
4731         }else{
4732             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4733             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4734             dc_scan= luma_dc_zigzag_scan;
4735         }
4736
4737         dquant= get_se_golomb(&s->gb);
4738
4739         if( dquant > 25 || dquant < -26 ){
4740             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4741             return -1;
4742         }
4743
             // the updated qscale is wrapped by +/-52 when it leaves the range 0..51
4744         s->qscale += dquant;
4745         if(((unsigned)s->qscale) > 51){
4746             if(s->qscale<0) s->qscale+= 52;
4747             else            s->qscale-= 52;
4748         }
4749
4750         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4751         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4752         if(IS_INTRA16x16(mb_type)){
                 // luma DC block first (16 coefficients, DC scan order)
4753             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4754                 return -1; //FIXME continue if partitioned and other return -1 too
4755             }
4756
4757             assert((cbp&15) == 0 || (cbp&15) == 15);
4758
4759             if(cbp&15){
                     // then the AC part of all 16 blocks: the scan starts at position 1
                     // and at most 15 coefficients are decoded per block
4760                 for(i8x8=0; i8x8<4; i8x8++){
4761                     for(i4x4=0; i4x4<4; i4x4++){
4762                         const int index= i4x4 + 4*i8x8;
4763                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4764                             return -1;
4765                         }
4766                     }
4767                 }
4768             }else{
4769                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4770             }
4771         }else{
                 // bit i8x8 of cbp tells whether luma 8x8 block i8x8 has coded coefficients
4772             for(i8x8=0; i8x8<4; i8x8++){
4773                 if(cbp & (1<<i8x8)){
4774                     if(IS_8x8DCT(mb_type)){
                             // the 8x8 block is coded as four runs of 16 coefficients
                             // that all land in the same 64 entry buffer, each with its
                             // own 16 entry part of scan8x8; the four counts are then
                             // summed into the first cache entry of the block
4775                         DCTELEM *buf = &h->mb[64*i8x8];
4776                         uint8_t *nnz;
4777                         for(i4x4=0; i4x4<4; i4x4++){
4778                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4779                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4780                                 return -1;
4781                         }
4782                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4783                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4784                     }else{
4785                         for(i4x4=0; i4x4<4; i4x4++){
4786                             const int index= i4x4 + 4*i8x8;
4787
                                 // dequant table 0 for intra, 3 for inter macroblocks
4788                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4789                                 return -1;
4790                             }
4791                         }
4792                     }
4793                 }else{
                         // uncoded 8x8 block: clear the counts of its four 4x4 blocks
4794                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4795                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4796                 }
4797             }
4798         }
4799
             // chroma DC: one block of 4 coefficients per plane, decoded with a NULL
             // dequant table, whenever either of cbp bits 4 and 5 is set
4800         if(cbp&0x30){
4801             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4802                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4803                     return -1;
4804                 }
4805         }
4806
             // chroma AC: only when cbp bit 5 is set; block indices 16..23, with the
             // dequant table chosen by plane and by intra/inter
4807         if(cbp&0x20){
4808             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4809                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4810                 for(i4x4=0; i4x4<4; i4x4++){
4811                     const int index= 16 + 4*chroma_idx + i4x4;
4812                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4813                         return -1;
4814                     }
4815                 }
4816             }
4817         }else{
4818             uint8_t * const nnz= &h->non_zero_count_cache[0];
4819             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4820             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4821         }
4822     }else{
             // no residual at all: clear the luma and chroma non-zero counts in the cache
4823         uint8_t * const nnz= &h->non_zero_count_cache[0];
4824         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4825         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4826         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4827     }
4828     s->current_picture.qscale_table[mb_xy]= s->qscale;
4829     write_back_non_zero_count(h);
4830
         // undo the ref_count doubling done before fill_caches
4831     if(MB_MBAFF){
4832         h->ref_count[0] >>= 1;
4833         h->ref_count[1] >>= 1;
4834     }
4835
4836     return 0;
4837 }
4838
4839 static int decode_cabac_field_decoding_flag(H264Context *h) {
4840     MpegEncContext * const s = &h->s;
4841     const int mb_x = s->mb_x;
4842     const int mb_y = s->mb_y & ~1;
4843     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4844     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4845
4846     unsigned int ctx = 0;
4847
4848     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4849         ctx += 1;
4850     }
4851     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4852         ctx += 1;
4853     }
4854
4855     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4856 }
4857
4858 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4859     uint8_t *state= &h->cabac_state[ctx_base];
4860     int mb_type;
4861
4862     if(intra_slice){
4863         MpegEncContext * const s = &h->s;
4864         const int mba_xy = h->left_mb_xy[0];
4865         const int mbb_xy = h->top_mb_xy;
4866         int ctx=0;
4867         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4868             ctx++;
4869         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4870             ctx++;
4871         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4872             return 0;   /* I4x4 */
4873         state += 2;
4874     }else{
4875         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4876             return 0;   /* I4x4 */
4877     }
4878
4879     if( get_cabac_terminate( &h->cabac ) )
4880         return 25;  /* PCM */
4881
4882     mb_type = 1; /* I16x16 */
4883     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4884     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4885         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4886     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4887     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4888     return mb_type;
4889 }
4890
4891 static int decode_cabac_mb_type_b( H264Context *h ) {
4892     MpegEncContext * const s = &h->s;
4893
4894         const int mba_xy = h->left_mb_xy[0];
4895         const int mbb_xy = h->top_mb_xy;
4896         int ctx = 0;
4897         int bits;
4898         assert(h->slice_type_nos == FF_B_TYPE);
4899
4900         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4901             ctx++;
4902         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4903             ctx++;
4904
4905         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4906             return 0; /* B_Direct_16x16 */
4907
4908         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4909             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4910         }
4911
4912         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4913         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4914         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4915         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4916         if( bits < 8 )
4917             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4918         else if( bits == 13 ) {
4919             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4920         } else if( bits == 14 )
4921             return 11; /* B_L1_L0_8x16 */
4922         else if( bits == 15 )
4923             return 22; /* B_8x8 */
4924
4925         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4926         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4927 }
4928
4929 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4930     MpegEncContext * const s = &h->s;
4931     int mba_xy, mbb_xy;
4932     int ctx = 0;
4933
4934     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4935         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4936         mba_xy = mb_xy - 1;
4937         if( (mb_y&1)
4938             && h->slice_table[mba_xy] == h->slice_num
4939             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4940             mba_xy += s->mb_stride;
4941         if( MB_FIELD ){
4942             mbb_xy = mb_xy - s->mb_stride;
4943             if( !(mb_y&1)
4944                 && h->slice_table[mbb_xy] == h->slice_num
4945                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4946                 mbb_xy -= s->mb_stride;
4947         }else
4948             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4949     }else{
4950         int mb_xy = h->mb_xy;
4951         mba_xy = mb_xy - 1;
4952         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4953     }
4954
4955     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4956         ctx++;
4957     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4958         ctx++;
4959
4960     if( h->slice_type_nos == FF_B_TYPE )
4961         ctx += 13;
4962     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4963 }
4964
4965 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4966     int mode = 0;
4967
4968     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4969         return pred_mode;
4970
4971     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4972     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4973     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4974
4975     if( mode >= pred_mode )
4976         return mode + 1;
4977     else
4978         return mode;
4979 }
4980
4981 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4982     const int mba_xy = h->left_mb_xy[0];
4983     const int mbb_xy = h->top_mb_xy;
4984
4985     int ctx = 0;
4986
4987     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4988     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4989         ctx++;
4990
4991     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4992         ctx++;
4993
4994     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4995         return 0;
4996
4997     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4998         return 1;
4999     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5000         return 2;
5001     else
5002         return 3;
5003 }
5004
5005 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5006     int cbp_b, cbp_a, ctx, cbp = 0;
5007
5008     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5009     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5010
5011     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5012     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5013     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5014     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5015     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5016     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5017     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5018     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5019     return cbp;
5020 }
5021 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5022     int ctx;
5023     int cbp_a, cbp_b;
5024
5025     cbp_a = (h->left_cbp>>4)&0x03;
5026     cbp_b = (h-> top_cbp>>4)&0x03;
5027
5028     ctx = 0;
5029     if( cbp_a > 0 ) ctx++;
5030     if( cbp_b > 0 ) ctx += 2;
5031     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5032         return 0;
5033
5034     ctx = 4;
5035     if( cbp_a == 2 ) ctx++;
5036     if( cbp_b == 2 ) ctx += 2;
5037     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5038 }
5039 static int decode_cabac_mb_dqp( H264Context *h) {
5040     int   ctx= h->last_qscale_diff != 0;
5041     int   val = 0;
5042
5043     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5044         ctx= 2+(ctx>>1);
5045         val++;
5046         if(val > 102) //prevent infinite loop
5047             return INT_MIN;
5048     }
5049
5050     if( val&0x01 )
5051         return   (val + 1)>>1 ;
5052     else
5053         return -((val + 1)>>1);
5054 }
5055 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5056     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5057         return 0;   /* 8x8 */
5058     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5059         return 1;   /* 8x4 */
5060     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5061         return 2;   /* 4x8 */
5062     return 3;       /* 4x4 */
5063 }
5064 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5065     int type;
5066     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5067         return 0;   /* B_Direct_8x8 */
5068     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5069         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5070     type = 3;
5071     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5072         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5073             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5074         type += 4;
5075     }
5076     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5077     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5078     return type;
5079 }
5080
5081 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5082     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5083 }
5084
5085 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5086     int refa = h->ref_cache[list][scan8[n] - 1];
5087     int refb = h->ref_cache[list][scan8[n] - 8];
5088     int ref  = 0;
5089     int ctx  = 0;
5090
5091     if( h->slice_type_nos == FF_B_TYPE) {
5092         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5093             ctx++;
5094         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5095             ctx += 2;
5096     } else {
5097         if( refa > 0 )
5098             ctx++;
5099         if( refb > 0 )
5100             ctx += 2;
5101     }
5102
5103     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5104         ref++;
5105         ctx = (ctx>>2)+4;
5106         if(ref >= 32 /*h->ref_list[list]*/){
5107             return -1;
5108         }
5109     }
5110     return ref;
5111 }
5112
5113 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5114     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5115                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5116     int ctxbase = (l == 0) ? 40 : 47;
5117     int mvd;
5118     int ctx = (amvd>2) + (amvd>32);
5119
5120     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5121         return 0;
5122
5123     mvd= 1;
5124     ctx= 3;
5125     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5126         mvd++;
5127         if( ctx < 6 )
5128             ctx++;
5129     }
5130
5131     if( mvd >= 9 ) {
5132         int k = 3;
5133         while( get_cabac_bypass( &h->cabac ) ) {
5134             mvd += 1 << k;
5135             k++;
5136             if(k>24){
5137                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5138                 return INT_MIN;
5139             }
5140         }
5141         while( k-- ) {
5142             if( get_cabac_bypass( &h->cabac ) )
5143                 mvd += 1 << k;
5144         }
5145     }
5146     return get_cabac_bypass_sign( &h->cabac, -mvd );
5147 }
5148
5149 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5150     int nza, nzb;
5151     int ctx = 0;
5152
5153     if( is_dc ) {
5154         if( cat == 0 ) {
5155             nza = h->left_cbp&0x100;
5156             nzb = h-> top_cbp&0x100;
5157         } else {
5158             nza = (h->left_cbp>>(6+idx))&0x01;
5159             nzb = (h-> top_cbp>>(6+idx))&0x01;
5160         }
5161     } else {
5162         assert(cat == 1 || cat == 2 || cat == 4);
5163         nza = h->non_zero_count_cache[scan8[idx] - 1];
5164         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5165     }
5166
5167     if( nza > 0 )
5168         ctx++;
5169
5170     if( nzb > 0 )
5171         ctx += 2;
5172
5173     return ctx + 4 * cat;
5174 }
5175
5176 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5177     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5178     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5179     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5180     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5181 };
5182
5183 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5184     static const int significant_coeff_flag_offset[2][6] = {
5185       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5186       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5187     };
5188     static const int last_coeff_flag_offset[2][6] = {
5189       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5190       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5191     };
5192     static const int coeff_abs_level_m1_offset[6] = {
5193         227+0, 227+10, 227+20, 227+30, 227+39, 426
5194     };
5195     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5196       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5197         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5198         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5199        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5200       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5201         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5202         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5203         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5204     };
5205     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5206      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5207      * map node ctx => cabac ctx for level=1 */
5208     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5209     /* map node ctx => cabac ctx for level>1 */
5210     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5211     static const uint8_t coeff_abs_level_transition[2][8] = {
5212     /* update node ctx after decoding a level=1 */
5213         { 1, 2, 3, 3, 4, 5, 6, 7 },
5214     /* update node ctx after decoding a level>1 */
5215         { 4, 4, 4, 4, 5, 6, 7, 7 }
5216     };
5217
5218     int index[64];
5219
5220     int av_unused last;
5221     int coeff_count = 0;
5222     int node_ctx = 0;
5223
5224     uint8_t *significant_coeff_ctx_base;
5225     uint8_t *last_coeff_ctx_base;
5226     uint8_t *abs_level_m1_ctx_base;
5227
5228 #if !ARCH_X86
5229 #define CABAC_ON_STACK
5230 #endif
5231 #ifdef CABAC_ON_STACK
5232 #define CC &cc
5233     CABACContext cc;
5234     cc.range     = h->cabac.range;
5235     cc.low       = h->cabac.low;
5236     cc.bytestream= h->cabac.bytestream;
5237 #else
5238 #define CC &h->cabac
5239 #endif
5240
5241
5242     /* cat: 0-> DC 16x16  n = 0
5243      *      1-> AC 16x16  n = luma4x4idx
5244      *      2-> Luma4x4   n = luma4x4idx
5245      *      3-> DC Chroma n = iCbCr
5246      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5247      *      5-> Luma8x8   n = 4 * luma8x8idx
5248      */
5249
5250     /* read coded block flag */
5251     if( is_dc || cat != 5 ) {
5252         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5253             if( !is_dc )
5254                 h->non_zero_count_cache[scan8[n]] = 0;
5255
5256 #ifdef CABAC_ON_STACK
5257             h->cabac.range     = cc.range     ;
5258             h->cabac.low       = cc.low       ;
5259             h->cabac.bytestream= cc.bytestream;
5260 #endif
5261             return;
5262         }
5263     }
5264
5265     significant_coeff_ctx_base = h->cabac_state
5266         + significant_coeff_flag_offset[MB_FIELD][cat];
5267     last_coeff_ctx_base = h->cabac_state
5268         + last_coeff_flag_offset[MB_FIELD][cat];
5269     abs_level_m1_ctx_base = h->cabac_state
5270         + coeff_abs_level_m1_offset[cat];
5271
5272     if( !is_dc && cat == 5 ) {
5273 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5274         for(last= 0; last < coefs; last++) { \
5275             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5276             if( get_cabac( CC, sig_ctx )) { \
5277                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5278                 index[coeff_count++] = last; \
5279                 if( get_cabac( CC, last_ctx ) ) { \
5280                     last= max_coeff; \
5281                     break; \
5282                 } \
5283             } \
5284         }\
5285         if( last == max_coeff -1 ) {\
5286             index[coeff_count++] = last;\
5287         }
5288         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5289 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5290         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5291     } else {
5292         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5293 #else
5294         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5295     } else {
5296         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5297 #endif
5298     }
5299     assert(coeff_count > 0);
5300
5301     if( is_dc ) {
5302         if( cat == 0 )
5303             h->cbp_table[h->mb_xy] |= 0x100;
5304         else
5305             h->cbp_table[h->mb_xy] |= 0x40 << n;
5306     } else {
5307         if( cat == 5 )
5308             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5309         else {
5310             assert( cat == 1 || cat == 2 || cat == 4 );
5311             h->non_zero_count_cache[scan8[n]] = coeff_count;
5312         }
5313     }
5314
5315     do {
5316         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5317
5318         int j= scantable[index[--coeff_count]];
5319
5320         if( get_cabac( CC, ctx ) == 0 ) {
5321             node_ctx = coeff_abs_level_transition[0][node_ctx];
5322             if( is_dc ) {
5323                 block[j] = get_cabac_bypass_sign( CC, -1);
5324             }else{
5325                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5326             }
5327         } else {
5328             int coeff_abs = 2;
5329             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5330             node_ctx = coeff_abs_level_transition[1][node_ctx];
5331
5332             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5333                 coeff_abs++;
5334             }
5335
5336             if( coeff_abs >= 15 ) {
5337                 int j = 0;
5338                 while( get_cabac_bypass( CC ) ) {
5339                     j++;
5340                 }
5341
5342                 coeff_abs=1;
5343                 while( j-- ) {
5344                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5345                 }
5346                 coeff_abs+= 14;
5347             }
5348
5349             if( is_dc ) {
5350                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5351             }else{
5352                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5353             }
5354         }
5355     } while( coeff_count );
5356 #ifdef CABAC_ON_STACK
5357             h->cabac.range     = cc.range     ;
5358             h->cabac.low       = cc.low       ;
5359             h->cabac.bytestream= cc.bytestream;
5360 #endif
5361
5362 }
5363
5364 #if !CONFIG_SMALL
5365 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5366     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5367 }
5368
5369 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5370     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5371 }
5372 #endif
5373
5374 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5375 #if CONFIG_SMALL
5376     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5377 #else
5378     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5379     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5380 #endif
5381 }
5382
5383 static inline void compute_mb_neighbors(H264Context *h)
5384 {
5385     MpegEncContext * const s = &h->s;
5386     const int mb_xy  = h->mb_xy;
5387     h->top_mb_xy     = mb_xy - s->mb_stride;
5388     h->left_mb_xy[0] = mb_xy - 1;
5389     if(FRAME_MBAFF){
5390         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5391         const int top_pair_xy      = pair_xy     - s->mb_stride;
5392         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5393         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5394         const int curr_mb_field_flag = MB_FIELD;
5395         const int bottom = (s->mb_y & 1);
5396
5397         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5398             h->top_mb_xy -= s->mb_stride;
5399         }
5400         if (!left_mb_field_flag == curr_mb_field_flag) {
5401             h->left_mb_xy[0] = pair_xy - 1;
5402         }
5403     } else if (FIELD_PICTURE) {
5404         h->top_mb_xy -= s->mb_stride;
5405     }
5406     return;
5407 }
5408
5409 /**
5410  * decodes a macroblock
5411  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5412  */
5413 static int decode_mb_cabac(H264Context *h) {
5414     MpegEncContext * const s = &h->s;
5415     int mb_xy;
5416     int mb_type, partition_count, cbp = 0;
5417     int dct8x8_allowed= h->pps.transform_8x8_mode;
5418
5419     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5420
5421     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5422     if( h->slice_type_nos != FF_I_TYPE ) {
5423         int skip;
5424         /* a skipped mb needs the aff flag from the following mb */
5425         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5426             predict_field_decoding_flag(h);
5427         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5428             skip = h->next_mb_skipped;
5429         else
5430             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5431         /* read skip flags */
5432         if( skip ) {
5433             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5434                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5435                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5436                 if(!h->next_mb_skipped)
5437                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5438             }
5439
5440             decode_mb_skip(h);
5441
5442             h->cbp_table[mb_xy] = 0;
5443             h->chroma_pred_mode_table[mb_xy] = 0;
5444             h->last_qscale_diff = 0;
5445
5446             return 0;
5447
5448         }
5449     }
5450     if(FRAME_MBAFF){
5451         if( (s->mb_y&1) == 0 )
5452             h->mb_mbaff =
5453             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5454     }
5455
5456     h->prev_mb_skipped = 0;
5457
5458     compute_mb_neighbors(h);
5459
5460     if( h->slice_type_nos == FF_B_TYPE ) {
5461         mb_type = decode_cabac_mb_type_b( h );
5462         if( mb_type < 23 ){
5463             partition_count= b_mb_type_info[mb_type].partition_count;
5464             mb_type=         b_mb_type_info[mb_type].type;
5465         }else{
5466             mb_type -= 23;
5467             goto decode_intra_mb;
5468         }
5469     } else if( h->slice_type_nos == FF_P_TYPE ) {
5470         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5471             /* P-type */
5472             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5473                 /* P_L0_D16x16, P_8x8 */
5474                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5475             } else {
5476                 /* P_L0_D8x16, P_L0_D16x8 */
5477                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5478             }
5479             partition_count= p_mb_type_info[mb_type].partition_count;
5480             mb_type=         p_mb_type_info[mb_type].type;
5481         } else {
5482             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5483             goto decode_intra_mb;
5484         }
5485     } else {
5486         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5487         if(h->slice_type == FF_SI_TYPE && mb_type)
5488             mb_type--;
5489         assert(h->slice_type_nos == FF_I_TYPE);
5490 decode_intra_mb:
5491         partition_count = 0;
5492         cbp= i_mb_type_info[mb_type].cbp;
5493         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5494         mb_type= i_mb_type_info[mb_type].type;
5495     }
5496     if(MB_FIELD)
5497         mb_type |= MB_TYPE_INTERLACED;
5498
5499     h->slice_table[ mb_xy ]= h->slice_num;
5500
5501     if(IS_INTRA_PCM(mb_type)) {
5502         const uint8_t *ptr;
5503
5504         // We assume these blocks are very rare so we do not optimize it.
5505         // FIXME The two following lines get the bitstream position in the cabac
5506         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5507         ptr= h->cabac.bytestream;
5508         if(h->cabac.low&0x1) ptr--;
5509         if(CABAC_BITS==16){
5510             if(h->cabac.low&0x1FF) ptr--;
5511         }
5512
5513         // The pixels are stored in the same order as levels in h->mb array.
5514         memcpy(h->mb, ptr, 256); ptr+=256;
5515         if(CHROMA){
5516             memcpy(h->mb+128, ptr, 128); ptr+=128;
5517         }
5518
5519         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5520
5521         // All blocks are present
5522         h->cbp_table[mb_xy] = 0x1ef;
5523         h->chroma_pred_mode_table[mb_xy] = 0;
5524         // In deblocking, the quantizer is 0
5525         s->current_picture.qscale_table[mb_xy]= 0;
5526         // All coeffs are present
5527         memset(h->non_zero_count[mb_xy], 16, 16);
5528         s->current_picture.mb_type[mb_xy]= mb_type;
5529         h->last_qscale_diff = 0;
5530         return 0;
5531     }
5532
5533     if(MB_MBAFF){
5534         h->ref_count[0] <<= 1;
5535         h->ref_count[1] <<= 1;
5536     }
5537
5538     fill_caches(h, mb_type, 0);
5539
5540     if( IS_INTRA( mb_type ) ) {
5541         int i, pred_mode;
5542         if( IS_INTRA4x4( mb_type ) ) {
5543             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5544                 mb_type |= MB_TYPE_8x8DCT;
5545                 for( i = 0; i < 16; i+=4 ) {
5546                     int pred = pred_intra_mode( h, i );
5547                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5548                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5549                 }
5550             } else {
5551                 for( i = 0; i < 16; i++ ) {
5552                     int pred = pred_intra_mode( h, i );
5553                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5554
5555                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5556                 }
5557             }
5558             write_back_intra_pred_mode(h);
5559             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5560         } else {
5561             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5562             if( h->intra16x16_pred_mode < 0 ) return -1;
5563         }
5564         if(CHROMA){
5565             h->chroma_pred_mode_table[mb_xy] =
5566             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5567
5568             pred_mode= check_intra_pred_mode( h, pred_mode );
5569             if( pred_mode < 0 ) return -1;
5570             h->chroma_pred_mode= pred_mode;
5571         }
5572     } else if( partition_count == 4 ) {
5573         int i, j, sub_partition_count[4], list, ref[2][4];
5574
5575         if( h->slice_type_nos == FF_B_TYPE ) {
5576             for( i = 0; i < 4; i++ ) {
5577                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5578                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5579                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5580             }
5581             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5582                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5583                 pred_direct_motion(h, &mb_type);
5584                 h->ref_cache[0][scan8[4]] =
5585                 h->ref_cache[1][scan8[4]] =
5586                 h->ref_cache[0][scan8[12]] =
5587                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5588                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5589                     for( i = 0; i < 4; i++ )
5590                         if( IS_DIRECT(h->sub_mb_type[i]) )
5591                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5592                 }
5593             }
5594         } else {
5595             for( i = 0; i < 4; i++ ) {
5596                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5597                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5598                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5599             }
5600         }
5601
5602         for( list = 0; list < h->list_count; list++ ) {
5603                 for( i = 0; i < 4; i++ ) {
5604                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5605                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5606                         if( h->ref_count[list] > 1 ){
5607                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5608                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5609                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5610                                 return -1;
5611                             }
5612                         }else
5613                             ref[list][i] = 0;
5614                     } else {
5615                         ref[list][i] = -1;
5616                     }
5617                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5618                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5619                 }
5620         }
5621
5622         if(dct8x8_allowed)
5623             dct8x8_allowed = get_dct8x8_allowed(h);
5624
5625         for(list=0; list<h->list_count; list++){
5626             for(i=0; i<4; i++){
5627                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5628                 if(IS_DIRECT(h->sub_mb_type[i])){
5629                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5630                     continue;
5631                 }
5632
5633                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5634                     const int sub_mb_type= h->sub_mb_type[i];
5635                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5636                     for(j=0; j<sub_partition_count[i]; j++){
5637                         int mpx, mpy;
5638                         int mx, my;
5639                         const int index= 4*i + block_width*j;
5640                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5641                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5642                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5643
5644                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5645                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5646                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5647
5648                         if(IS_SUB_8X8(sub_mb_type)){
5649                             mv_cache[ 1 ][0]=
5650                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5651                             mv_cache[ 1 ][1]=
5652                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5653
5654                             mvd_cache[ 1 ][0]=
5655                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5656                             mvd_cache[ 1 ][1]=
5657                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5658                         }else if(IS_SUB_8X4(sub_mb_type)){
5659                             mv_cache[ 1 ][0]= mx;
5660                             mv_cache[ 1 ][1]= my;
5661
5662                             mvd_cache[ 1 ][0]= mx - mpx;
5663                             mvd_cache[ 1 ][1]= my - mpy;
5664                         }else if(IS_SUB_4X8(sub_mb_type)){
5665                             mv_cache[ 8 ][0]= mx;
5666                             mv_cache[ 8 ][1]= my;
5667
5668                             mvd_cache[ 8 ][0]= mx - mpx;
5669                             mvd_cache[ 8 ][1]= my - mpy;
5670                         }
5671                         mv_cache[ 0 ][0]= mx;
5672                         mv_cache[ 0 ][1]= my;
5673
5674                         mvd_cache[ 0 ][0]= mx - mpx;
5675                         mvd_cache[ 0 ][1]= my - mpy;
5676                     }
5677                 }else{
5678                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5679                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5680                     p[0] = p[1] = p[8] = p[9] = 0;
5681                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5682                 }
5683             }
5684         }
5685     } else if( IS_DIRECT(mb_type) ) {
5686         pred_direct_motion(h, &mb_type);
5687         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5688         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5689         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5690     } else {
5691         int list, mx, my, i, mpx, mpy;
5692         if(IS_16X16(mb_type)){
5693             for(list=0; list<h->list_count; list++){
5694                 if(IS_DIR(mb_type, 0, list)){
5695                     int ref;
5696                     if(h->ref_count[list] > 1){
5697                         ref= decode_cabac_mb_ref(h, list, 0);
5698                         if(ref >= (unsigned)h->ref_count[list]){
5699                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5700                             return -1;
5701                         }
5702                     }else
5703                         ref=0;
5704                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5705                 }else
5706                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5707             }
5708             for(list=0; list<h->list_count; list++){
5709                 if(IS_DIR(mb_type, 0, list)){
5710                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5711
5712                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5713                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5714                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5715
5716                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5717                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5718                 }else
5719                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5720             }
5721         }
5722         else if(IS_16X8(mb_type)){
5723             for(list=0; list<h->list_count; list++){
5724                     for(i=0; i<2; i++){
5725                         if(IS_DIR(mb_type, i, list)){
5726                             int ref;
5727                             if(h->ref_count[list] > 1){
5728                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5729                                 if(ref >= (unsigned)h->ref_count[list]){
5730                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5731                                     return -1;
5732                                 }
5733                             }else
5734                                 ref=0;
5735                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5736                         }else
5737                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5738                     }
5739             }
5740             for(list=0; list<h->list_count; list++){
5741                 for(i=0; i<2; i++){
5742                     if(IS_DIR(mb_type, i, list)){
5743                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5744                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5745                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5746                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5747
5748                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5749                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5750                     }else{
5751                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5752                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5753                     }
5754                 }
5755             }
5756         }else{
5757             assert(IS_8X16(mb_type));
5758             for(list=0; list<h->list_count; list++){
5759                     for(i=0; i<2; i++){
5760                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5761                             int ref;
5762                             if(h->ref_count[list] > 1){
5763                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5764                                 if(ref >= (unsigned)h->ref_count[list]){
5765                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5766                                     return -1;
5767                                 }
5768                             }else
5769                                 ref=0;
5770                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5771                         }else
5772                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5773                     }
5774             }
5775             for(list=0; list<h->list_count; list++){
5776                 for(i=0; i<2; i++){
5777                     if(IS_DIR(mb_type, i, list)){
5778                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5779                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5780                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5781
5782                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5783                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5784                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5785                     }else{
5786                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5787                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5788                     }
5789                 }
5790             }
5791         }
5792     }
5793
5794    if( IS_INTER( mb_type ) ) {
5795         h->chroma_pred_mode_table[mb_xy] = 0;
5796         write_back_motion( h, mb_type );
5797    }
5798
5799     if( !IS_INTRA16x16( mb_type ) ) {
5800         cbp  = decode_cabac_mb_cbp_luma( h );
5801         if(CHROMA)
5802             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5803     }
5804
5805     h->cbp_table[mb_xy] = h->cbp = cbp;
5806
5807     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5808         if( decode_cabac_mb_transform_size( h ) )
5809             mb_type |= MB_TYPE_8x8DCT;
5810     }
5811     s->current_picture.mb_type[mb_xy]= mb_type;
5812
5813     if( cbp || IS_INTRA16x16( mb_type ) ) {
5814         const uint8_t *scan, *scan8x8, *dc_scan;
5815         const uint32_t *qmul;
5816         int dqp;
5817
5818         if(IS_INTERLACED(mb_type)){
5819             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5820             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5821             dc_scan= luma_dc_field_scan;
5822         }else{
5823             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5824             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5825             dc_scan= luma_dc_zigzag_scan;
5826         }
5827
5828         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5829         if( dqp == INT_MIN ){
5830             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5831             return -1;
5832         }
5833         s->qscale += dqp;
5834         if(((unsigned)s->qscale) > 51){
5835             if(s->qscale<0) s->qscale+= 52;
5836             else            s->qscale-= 52;
5837         }
5838         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5839         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5840
5841         if( IS_INTRA16x16( mb_type ) ) {
5842             int i;
5843             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5844             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5845
5846             if( cbp&15 ) {
5847                 qmul = h->dequant4_coeff[0][s->qscale];
5848                 for( i = 0; i < 16; i++ ) {
5849                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5850                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5851                 }
5852             } else {
5853                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5854             }
5855         } else {
5856             int i8x8, i4x4;
5857             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5858                 if( cbp & (1<<i8x8) ) {
5859                     if( IS_8x8DCT(mb_type) ) {
5860                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5861                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5862                     } else {
5863                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5864                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5865                             const int index = 4*i8x8 + i4x4;
5866                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5867 //START_TIMER
5868                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5869 //STOP_TIMER("decode_residual")
5870                         }
5871                     }
5872                 } else {
5873                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5874                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5875                 }
5876             }
5877         }
5878
5879         if( cbp&0x30 ){
5880             int c;
5881             for( c = 0; c < 2; c++ ) {
5882                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5883                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5884             }
5885         }
5886
5887         if( cbp&0x20 ) {
5888             int c, i;
5889             for( c = 0; c < 2; c++ ) {
5890                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5891                 for( i = 0; i < 4; i++ ) {
5892                     const int index = 16 + 4 * c + i;
5893                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5894                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5895                 }
5896             }
5897         } else {
5898             uint8_t * const nnz= &h->non_zero_count_cache[0];
5899             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5900             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5901         }
5902     } else {
5903         uint8_t * const nnz= &h->non_zero_count_cache[0];
5904         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5905         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5906         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5907         h->last_qscale_diff = 0;
5908     }
5909
5910     s->current_picture.qscale_table[mb_xy]= s->qscale;
5911     write_back_non_zero_count(h);
5912
5913     if(MB_MBAFF){
5914         h->ref_count[0] >>= 1;
5915         h->ref_count[1] >>= 1;
5916     }
5917
5918     return 0;
5919 }
5920
5921
5922 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5923     const int index_a = qp + h->slice_alpha_c0_offset;
5924     const int alpha = (alpha_table+52)[index_a];
5925     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5926
5927     if( bS[0] < 4 ) {
5928         int8_t tc[4];
5929         tc[0] = (tc0_table+52)[index_a][bS[0]];
5930         tc[1] = (tc0_table+52)[index_a][bS[1]];
5931         tc[2] = (tc0_table+52)[index_a][bS[2]];
5932         tc[3] = (tc0_table+52)[index_a][bS[3]];
5933         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5934     } else {
5935         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5936     }
5937 }
5938 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5939     const int index_a = qp + h->slice_alpha_c0_offset;
5940     const int alpha = (alpha_table+52)[index_a];
5941     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5942
5943     if( bS[0] < 4 ) {
5944         int8_t tc[4];
5945         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5946         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5947         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5948         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5949         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5950     } else {
5951         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5952     }
5953 }
5954
5955 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5956     int i;
5957     for( i = 0; i < 16; i++, pix += stride) {
5958         int index_a;
5959         int alpha;
5960         int beta;
5961
5962         int qp_index;
5963         int bS_index = (i >> 1);
5964         if (!MB_FIELD) {
5965             bS_index &= ~1;
5966             bS_index |= (i & 1);
5967         }
5968
5969         if( bS[bS_index] == 0 ) {
5970             continue;
5971         }
5972
5973         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5974         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5975         alpha = (alpha_table+52)[index_a];
5976         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5977
5978         if( bS[bS_index] < 4 ) {
5979             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5980             const int p0 = pix[-1];
5981             const int p1 = pix[-2];
5982             const int p2 = pix[-3];
5983             const int q0 = pix[0];
5984             const int q1 = pix[1];
5985             const int q2 = pix[2];
5986
5987             if( FFABS( p0 - q0 ) < alpha &&
5988                 FFABS( p1 - p0 ) < beta &&
5989                 FFABS( q1 - q0 ) < beta ) {
5990                 int tc = tc0;
5991                 int i_delta;
5992
5993                 if( FFABS( p2 - p0 ) < beta ) {
5994                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5995                     tc++;
5996                 }
5997                 if( FFABS( q2 - q0 ) < beta ) {
5998                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5999                     tc++;
6000                 }
6001
6002                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6003                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6004                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6005                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6006             }
6007         }else{
6008             const int p0 = pix[-1];
6009             const int p1 = pix[-2];
6010             const int p2 = pix[-3];
6011
6012             const int q0 = pix[0];
6013             const int q1 = pix[1];
6014             const int q2 = pix[2];
6015
6016             if( FFABS( p0 - q0 ) < alpha &&
6017                 FFABS( p1 - p0 ) < beta &&
6018                 FFABS( q1 - q0 ) < beta ) {
6019
6020                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6021                     if( FFABS( p2 - p0 ) < beta)
6022                     {
6023                         const int p3 = pix[-4];
6024                         /* p0', p1', p2' */
6025                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6026                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6027                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6028                     } else {
6029                         /* p0' */
6030                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6031                     }
6032                     if( FFABS( q2 - q0 ) < beta)
6033                     {
6034                         const int q3 = pix[3];
6035                         /* q0', q1', q2' */
6036                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6037                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6038                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6039                     } else {
6040                         /* q0' */
6041                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6042                     }
6043                 }else{
6044                     /* p0', q0' */
6045                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6046                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6047                 }
6048                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6049             }
6050         }
6051     }
6052 }
6053 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6054     int i;
6055     for( i = 0; i < 8; i++, pix += stride) {
6056         int index_a;
6057         int alpha;
6058         int beta;
6059
6060         int qp_index;
6061         int bS_index = i;
6062
6063         if( bS[bS_index] == 0 ) {
6064             continue;
6065         }
6066
6067         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6068         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6069         alpha = (alpha_table+52)[index_a];
6070         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6071
6072         if( bS[bS_index] < 4 ) {
6073             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6074             const int p0 = pix[-1];
6075             const int p1 = pix[-2];
6076             const int q0 = pix[0];
6077             const int q1 = pix[1];
6078
6079             if( FFABS( p0 - q0 ) < alpha &&
6080                 FFABS( p1 - p0 ) < beta &&
6081                 FFABS( q1 - q0 ) < beta ) {
6082                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6083
6084                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6085                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6086                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6087             }
6088         }else{
6089             const int p0 = pix[-1];
6090             const int p1 = pix[-2];
6091             const int q0 = pix[0];
6092             const int q1 = pix[1];
6093
6094             if( FFABS( p0 - q0 ) < alpha &&
6095                 FFABS( p1 - p0 ) < beta &&
6096                 FFABS( q1 - q0 ) < beta ) {
6097
6098                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6099                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6100                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6101             }
6102         }
6103     }
6104 }
6105
6106 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6107     const int index_a = qp + h->slice_alpha_c0_offset;
6108     const int alpha = (alpha_table+52)[index_a];
6109     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6110
6111     if( bS[0] < 4 ) {
6112         int8_t tc[4];
6113         tc[0] = (tc0_table+52)[index_a][bS[0]];
6114         tc[1] = (tc0_table+52)[index_a][bS[1]];
6115         tc[2] = (tc0_table+52)[index_a][bS[2]];
6116         tc[3] = (tc0_table+52)[index_a][bS[3]];
6117         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6118     } else {
6119         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6120     }
6121 }
6122
6123 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6124     const int index_a = qp + h->slice_alpha_c0_offset;
6125     const int alpha = (alpha_table+52)[index_a];
6126     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6127
6128     if( bS[0] < 4 ) {
6129         int8_t tc[4];
6130         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6131         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6132         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6133         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6134         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6135     } else {
6136         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6137     }
6138 }
6139
6140 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6141     MpegEncContext * const s = &h->s;
6142     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6143     int mb_xy, mb_type;
6144     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6145
6146     mb_xy = h->mb_xy;
6147
6148     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6149         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6150        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6151                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6152         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6153         return;
6154     }
6155     assert(!FRAME_MBAFF);
6156
6157     mb_type = s->current_picture.mb_type[mb_xy];
6158     qp = s->current_picture.qscale_table[mb_xy];
6159     qp0 = s->current_picture.qscale_table[mb_xy-1];
6160     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6161     qpc = get_chroma_qp( h, 0, qp );
6162     qpc0 = get_chroma_qp( h, 0, qp0 );
6163     qpc1 = get_chroma_qp( h, 0, qp1 );
6164     qp0 = (qp + qp0 + 1) >> 1;
6165     qp1 = (qp + qp1 + 1) >> 1;
6166     qpc0 = (qpc + qpc0 + 1) >> 1;
6167     qpc1 = (qpc + qpc1 + 1) >> 1;
6168     qp_thresh = 15 - h->slice_alpha_c0_offset;
6169     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6170        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6171         return;
6172
6173     if( IS_INTRA(mb_type) ) {
6174         int16_t bS4[4] = {4,4,4,4};
6175         int16_t bS3[4] = {3,3,3,3};
6176         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6177         if( IS_8x8DCT(mb_type) ) {
6178             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6179             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6180             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6181             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6182         } else {
6183             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6184             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6185             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6186             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6187             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6188             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6189             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6190             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6191         }
6192         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6193         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6194         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6195         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6196         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6197         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6198         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6199         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6200         return;
6201     } else {
6202         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6203         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6204         int edges;
6205         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6206             edges = 4;
6207             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6208         } else {
6209             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6210                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6211             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6212                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6213                              ? 3 : 0;
6214             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6215             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6216             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6217                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6218         }
6219         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6220             bSv[0][0] = 0x0004000400040004ULL;
6221         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6222             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6223
6224 #define FILTER(hv,dir,edge)\
6225         if(bSv[dir][edge]) {\
6226             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6227             if(!(edge&1)) {\
6228                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6229                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6230             }\
6231         }
6232         if( edges == 1 ) {
6233             FILTER(v,0,0);
6234             FILTER(h,1,0);
6235         } else if( IS_8x8DCT(mb_type) ) {
6236             FILTER(v,0,0);
6237             FILTER(v,0,2);
6238             FILTER(h,1,0);
6239             FILTER(h,1,2);
6240         } else {
6241             FILTER(v,0,0);
6242             FILTER(v,0,1);
6243             FILTER(v,0,2);
6244             FILTER(v,0,3);
6245             FILTER(h,1,0);
6246             FILTER(h,1,1);
6247             FILTER(h,1,2);
6248             FILTER(h,1,3);
6249         }
6250 #undef FILTER
6251     }
6252 }
6253
6254
6255 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6256     MpegEncContext * const s = &h->s;
6257     int edge;
6258     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6259     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6260     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6261     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6262     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6263
6264     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6265                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6266     // how often to recheck mv-based bS when iterating between edges
6267     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6268                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6269     // how often to recheck mv-based bS when iterating along each edge
6270     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6271
6272     if (first_vertical_edge_done) {
6273         start = 1;
6274     }
6275
6276     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6277         start = 1;
6278
6279     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6280         && !IS_INTERLACED(mb_type)
6281         && IS_INTERLACED(mbm_type)
6282         ) {
6283         // This is a special case in the norm where the filtering must
6284         // be done twice (one each of the field) even if we are in a
6285         // frame macroblock.
6286         //
6287         static const int nnz_idx[4] = {4,5,6,3};
6288         unsigned int tmp_linesize   = 2 *   linesize;
6289         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6290         int mbn_xy = mb_xy - 2 * s->mb_stride;
6291         int qp;
6292         int i, j;
6293         int16_t bS[4];
6294
6295         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6296             if( IS_INTRA(mb_type) ||
6297                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6298                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6299             } else {
6300                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6301                 for( i = 0; i < 4; i++ ) {
6302                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6303                         mbn_nnz[nnz_idx[i]] != 0 )
6304                         bS[i] = 2;
6305                     else
6306                         bS[i] = 1;
6307                 }
6308             }
6309             // Do not use s->qscale as luma quantizer because it has not the same
6310             // value in IPCM macroblocks.
6311             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6312             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6313             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6314             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6315             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6316                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6317             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6318                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6319         }
6320
6321         start = 1;
6322     }
6323
6324     /* Calculate bS */
6325     for( edge = start; edge < edges; edge++ ) {
6326         /* mbn_xy: neighbor macroblock */
6327         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6328         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6329         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6330         int16_t bS[4];
6331         int qp;
6332
6333         if( (edge&1) && IS_8x8DCT(mb_type) )
6334             continue;
6335
6336         if( IS_INTRA(mb_type) ||
6337             IS_INTRA(mbn_type) ) {
6338             int value;
6339             if (edge == 0) {
6340                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6341                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6342                 ) {
6343                     value = 4;
6344                 } else {
6345                     value = 3;
6346                 }
6347             } else {
6348                 value = 3;
6349             }
6350             bS[0] = bS[1] = bS[2] = bS[3] = value;
6351         } else {
6352             int i, l;
6353             int mv_done;
6354
6355             if( edge & mask_edge ) {
6356                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6357                 mv_done = 1;
6358             }
6359             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6360                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6361                 mv_done = 1;
6362             }
6363             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6364                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6365                 int bn_idx= b_idx - (dir ? 8:1);
6366                 int v = 0;
6367
6368                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6369                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6370                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6371                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6372                 }
6373
6374                 if(h->slice_type_nos == FF_B_TYPE && v){
6375                     v=0;
6376                     for( l = 0; !v && l < 2; l++ ) {
6377                         int ln= 1-l;
6378                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6379                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6380                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6381                     }
6382                 }
6383
6384                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6385                 mv_done = 1;
6386             }
6387             else
6388                 mv_done = 0;
6389
6390             for( i = 0; i < 4; i++ ) {
6391                 int x = dir == 0 ? edge : i;
6392                 int y = dir == 0 ? i    : edge;
6393                 int b_idx= 8 + 4 + x + 8*y;
6394                 int bn_idx= b_idx - (dir ? 8:1);
6395
6396                 if( h->non_zero_count_cache[b_idx] |
6397                     h->non_zero_count_cache[bn_idx] ) {
6398                     bS[i] = 2;
6399                 }
6400                 else if(!mv_done)
6401                 {
6402                     bS[i] = 0;
6403                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6404                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6405                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6406                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6407                             bS[i] = 1;
6408                             break;
6409                         }
6410                     }
6411
6412                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6413                         bS[i] = 0;
6414                         for( l = 0; l < 2; l++ ) {
6415                             int ln= 1-l;
6416                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6417                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6418                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6419                                 bS[i] = 1;
6420                                 break;
6421                             }
6422                         }
6423                     }
6424                 }
6425             }
6426
6427             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6428                 continue;
6429         }
6430
6431         /* Filter edge */
6432         // Do not use s->qscale as luma quantizer because it has not the same
6433         // value in IPCM macroblocks.
6434         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6435         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6436         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6437         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6438         if( dir == 0 ) {
6439             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6440             if( (edge&1) == 0 ) {
6441                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6442                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6443                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6444                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6445             }
6446         } else {
6447             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6448             if( (edge&1) == 0 ) {
6449                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6450                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6451                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6452                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6453             }
6454         }
6455     }
6456 }
6457
/**
 * Deblock one macroblock.
 *
 * Steps, in order:
 *  - when not FRAME_MBAFF, return without filtering if the macroblock qp and the
 *    averaged qp towards the left and top neighbours are all <= a threshold
 *    derived from slice_alpha_c0_offset and the chroma qp index offsets;
 *  - for CAVLC streams with transform_8x8_mode, overwrite entries of
 *    non_zero_count_cache with cbp bits for 8x8dct macroblocks (top, left, current);
 *  - in MBAFF frames, when the left macroblock exists, differs in interlacing and
 *    is allowed by deblocking_filter, filter the first vertical edge here using
 *    8 bS values and two qp values per plane;
 *  - call filter_mb_dir() for dir 0 and then for dir 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column, used as mb_x + mb_y*mb_stride
 * @param mb_y       macroblock row
 * @param img_y      luma pointer handed to the edge filters
 * @param img_cb     Cb pointer handed to the edge filters
 * @param img_cr     Cr pointer handed to the edge filters
 * @param linesize   stride passed along with img_y
 * @param uvlinesize stride passed along with img_cb / img_cr
 */
6458 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6459     MpegEncContext * const s = &h->s;
6460     const int mb_xy= mb_x + mb_y*s->mb_stride;
6461     const int mb_type = s->current_picture.mb_type[mb_xy];
6462     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
         // set to 1 below when the MBAFF branch has already filtered the first vertical edge
6463     int first_vertical_edge_done = 0;
6464     int dir;
6465
6466     //for sufficiently low qp, filtering wouldn't do anything
6467     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6468     if(!FRAME_MBAFF){
6469         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6470         int qp = s->current_picture.qscale_table[mb_xy];
             // the left/top terms are skipped at the picture border (mb_x == 0 / mb_y == 0)
6471         if(qp <= qp_thresh
6472            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6473            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6474             return;
6475         }
6476     }
6477
6478     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6479     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6480         int top_type, left_type[2];
6481         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6482         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6483         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6484
             // top neighbour: cache entries 4..7 of row 0 take cbp bits 4 and 8 of the top macroblock
6485         if(IS_8x8DCT(top_type)){
6486             h->non_zero_count_cache[4+8*0]=
6487             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6488             h->non_zero_count_cache[6+8*0]=
6489             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6490         }
             // left neighbours: column 3 of rows 1..4 takes cbp bits 2 and 8 of the left macroblocks
6491         if(IS_8x8DCT(left_type[0])){
6492             h->non_zero_count_cache[3+8*1]=
6493             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6494         }
6495         if(IS_8x8DCT(left_type[1])){
6496             h->non_zero_count_cache[3+8*3]=
6497             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6498         }
6499
             // current macroblock: each group of four scan8 entries takes one bit of h->cbp
6500         if(IS_8x8DCT(mb_type)){
6501             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6502             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6503
6504             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6505             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6506
6507             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6508             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6509
6510             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6511             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6512         }
6513     }
6514
6515     if (FRAME_MBAFF
6516             // left mb is in picture
6517             && h->slice_table[mb_xy-1] != 0xFFFF
6518             // and current and left pair do not have the same interlaced type
6519             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6520             // and left mb is in the same slice if deblocking_filter == 2
6521             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6522         /* First vertical edge is different in MBAFF frames
6523          * There are 8 different bS to compute and 2 different Qp
6524          */
             // pair_xy addresses the macroblock at mb_x in the even row (mb_y with bit 0 cleared)
6525         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
             // the two macroblocks to the left of that position, one row apart
6526         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6527         int16_t bS[8];
6528         int qp[2];
6529         int bqp[2];
6530         int rqp[2];
6531         int mb_qp, mbn0_qp, mbn1_qp;
6532         int i;
6533         first_vertical_edge_done = 1;
6534
             // intra current macroblock: every bS is 4; otherwise decide per entry
6535         if( IS_INTRA(mb_type) )
6536             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6537         else {
6538             for( i = 0; i < 8; i++ ) {
                     // neighbour is selected by i>>2 for a field macroblock, by i&1 otherwise
6539                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6540
                     // 4: intra neighbour; 2: nonzero coefficients on either side; 1: otherwise
6541                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6542                     bS[i] = 4;
6543                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6544                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6545                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6546                                                                        :
6547                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6548                     bS[i] = 2;
6549                 else
6550                     bS[i] = 1;
6551             }
6552         }
6553
             // index 0 / 1 of qp, bqp, rqp: rounded average with left_mb_xy[0] / left_mb_xy[1]
6554         mb_qp = s->current_picture.qscale_table[mb_xy];
6555         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6556         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6557         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6558         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6559                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6560         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6561                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6562         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6563         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6564                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6565         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6566                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6567
6568         /* Filter edge */
6569         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6570         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6571         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6572         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6573         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6574     }
6575
         // Both variants call filter_mb_dir() for dir 0 and then dir 1;
         // first_vertical_edge_done is forwarded for dir 0 only, dir 1 always gets 0.
6576 #if CONFIG_SMALL
6577     for( dir = 0; dir < 2; dir++ )
6578         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6579 #else
6580     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6581     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6582 #endif
6583 }
6584
6585 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6586     H264Context *h = *(void**)arg;
6587     MpegEncContext * const s = &h->s;
6588     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6589
6590     s->mb_skip_run= -1;
6591
6592     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6593                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (CONFIG_H264_ENCODER && s->encoding);
6594
6595     if( h->pps.cabac ) {
6596         int i;
6597
6598         /* realign */
6599         align_get_bits( &s->gb );
6600
6601         /* init cabac */
6602         ff_init_cabac_states( &h->cabac);
6603         ff_init_cabac_decoder( &h->cabac,
6604                                s->gb.buffer + get_bits_count(&s->gb)/8,
6605                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6606         /* calculate pre-state */
6607         for( i= 0; i < 460; i++ ) {
6608             int pre;
6609             if( h->slice_type_nos == FF_I_TYPE )
6610                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6611             else
6612                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6613
6614             if( pre <= 63 )
6615                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6616             else
6617                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6618         }
6619
6620         for(;;){
6621 //START_TIMER
6622             int ret = decode_mb_cabac(h);
6623             int eos;
6624 //STOP_TIMER("decode_mb_cabac")
6625
6626             if(ret>=0) hl_decode_mb(h);
6627
6628             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6629                 s->mb_y++;
6630
6631                 ret = decode_mb_cabac(h);
6632
6633                 if(ret>=0) hl_decode_mb(h);
6634                 s->mb_y--;
6635             }
6636             eos = get_cabac_terminate( &h->cabac );
6637
6638             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6639                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6640                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6641                 return -1;
6642             }
6643
6644             if( ++s->mb_x >= s->mb_width ) {
6645                 s->mb_x = 0;
6646                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6647                 ++s->mb_y;
6648                 if(FIELD_OR_MBAFF_PICTURE) {
6649                     ++s->mb_y;
6650                 }
6651             }
6652
6653             if( eos || s->mb_y >= s->mb_height ) {
6654                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6655                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6656                 return 0;
6657             }
6658         }
6659
6660     } else {
6661         for(;;){
6662             int ret = decode_mb_cavlc(h);
6663
6664             if(ret>=0) hl_decode_mb(h);
6665
6666             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6667                 s->mb_y++;
6668                 ret = decode_mb_cavlc(h);
6669
6670                 if(ret>=0) hl_decode_mb(h);
6671                 s->mb_y--;
6672             }
6673
6674             if(ret<0){
6675                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6676                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6677
6678                 return -1;
6679             }
6680
6681             if(++s->mb_x >= s->mb_width){
6682                 s->mb_x=0;
6683                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6684                 ++s->mb_y;
6685                 if(FIELD_OR_MBAFF_PICTURE) {
6686                     ++s->mb_y;
6687                 }
6688                 if(s->mb_y >= s->mb_height){
6689                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6690
6691                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6692                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6693
6694                         return 0;
6695                     }else{
6696                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6697
6698                         return -1;
6699                     }
6700                 }
6701             }
6702
6703             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6704                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6705                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6706                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6707
6708                     return 0;
6709                 }else{
6710                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6711
6712                     return -1;
6713                 }
6714             }
6715         }
6716     }
6717
6718 #if 0
6719     for(;s->mb_y < s->mb_height; s->mb_y++){
6720         for(;s->mb_x < s->mb_width; s->mb_x++){
6721             int ret= decode_mb(h);
6722
6723             hl_decode_mb(h);
6724
6725             if(ret<0){
6726                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6727                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6728
6729                 return -1;
6730             }
6731
6732             if(++s->mb_x >= s->mb_width){
6733                 s->mb_x=0;
6734                 if(++s->mb_y >= s->mb_height){
6735                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6736                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6737
6738                         return 0;
6739                     }else{
6740                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6741
6742                         return -1;
6743                     }
6744                 }
6745             }
6746
6747             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6748                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6749                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6750
6751                     return 0;
6752                 }else{
6753                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6754
6755                     return -1;
6756                 }
6757             }
6758         }
6759         s->mb_x=0;
6760         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6761     }
6762 #endif
6763     return -1; //not reached
6764 }
6765
6766 static int decode_picture_timing(H264Context *h){
6767     MpegEncContext * const s = &h->s;
6768     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6769         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6770         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6771     }
6772     if(h->sps.pic_struct_present_flag){
6773         unsigned int i, num_clock_ts;
6774         h->sei_pic_struct = get_bits(&s->gb, 4);
6775
6776         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6777             return -1;
6778
6779         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6780
6781         for (i = 0 ; i < num_clock_ts ; i++){
6782             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6783                 unsigned int full_timestamp_flag;
6784                 skip_bits(&s->gb, 2);                 /* ct_type */
6785                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6786                 skip_bits(&s->gb, 5);                 /* counting_type */
6787                 full_timestamp_flag = get_bits(&s->gb, 1);
6788                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6789                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6790                 skip_bits(&s->gb, 8);                 /* n_frames */
6791                 if(full_timestamp_flag){
6792                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6793                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6794                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6795                 }else{
6796                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6797                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6798                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6799                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6800                             if(get_bits(&s->gb, 1))   /* hours_flag */
6801                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6802                         }
6803                     }
6804                 }
6805                 if(h->sps.time_offset_length > 0)
6806                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6807             }
6808         }
6809     }
6810     return 0;
6811 }
6812
6813 static int decode_unregistered_user_data(H264Context *h, int size){
6814     MpegEncContext * const s = &h->s;
6815     uint8_t user_data[16+256];
6816     int e, build, i;
6817
6818     if(size<16)
6819         return -1;
6820
6821     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6822         user_data[i]= get_bits(&s->gb, 8);
6823     }
6824
6825     user_data[i]= 0;
6826     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6827     if(e==1 && build>=0)
6828         h->x264_build= build;
6829
6830     if(s->avctx->debug & FF_DEBUG_BUGS)
6831         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6832
6833     for(; i<size; i++)
6834         skip_bits(&s->gb, 8);
6835
6836     return 0;
6837 }
6838
6839 static int decode_sei(H264Context *h){
6840     MpegEncContext * const s = &h->s;
6841
6842     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6843         int size, type;
6844
6845         type=0;
6846         do{
6847             type+= show_bits(&s->gb, 8);
6848         }while(get_bits(&s->gb, 8) == 255);
6849
6850         size=0;
6851         do{
6852             size+= show_bits(&s->gb, 8);
6853         }while(get_bits(&s->gb, 8) == 255);
6854
6855         switch(type){
6856         case 1: // Picture timing SEI
6857             if(decode_picture_timing(h) < 0)
6858                 return -1;
6859             break;
6860         case 5:
6861             if(decode_unregistered_user_data(h, size) < 0)
6862                 return -1;
6863             break;
6864         default:
6865             skip_bits(&s->gb, 8*size);
6866         }
6867
6868         //FIXME check bits here
6869         align_get_bits(&s->gb);
6870     }
6871
6872     return 0;
6873 }
6874
6875 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6876     MpegEncContext * const s = &h->s;
6877     int cpb_count, i;
6878     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6879
6880     if(cpb_count > 32U){
6881         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6882         return -1;
6883     }
6884
6885     get_bits(&s->gb, 4); /* bit_rate_scale */
6886     get_bits(&s->gb, 4); /* cpb_size_scale */
6887     for(i=0; i<cpb_count; i++){
6888         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6889         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6890         get_bits1(&s->gb);     /* cbr_flag */
6891     }
6892     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6893     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6894     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6895     sps->time_offset_length = get_bits(&s->gb, 5);
6896     return 0;
6897 }
6898
6899 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6900     MpegEncContext * const s = &h->s;
6901     int aspect_ratio_info_present_flag;
6902     unsigned int aspect_ratio_idc;
6903
6904     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6905
6906     if( aspect_ratio_info_present_flag ) {
6907         aspect_ratio_idc= get_bits(&s->gb, 8);
6908         if( aspect_ratio_idc == EXTENDED_SAR ) {
6909             sps->sar.num= get_bits(&s->gb, 16);
6910             sps->sar.den= get_bits(&s->gb, 16);
6911         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6912             sps->sar=  pixel_aspect[aspect_ratio_idc];
6913         }else{
6914             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6915             return -1;
6916         }
6917     }else{
6918         sps->sar.num=
6919         sps->sar.den= 0;
6920     }
6921 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6922
6923     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6924         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6925     }
6926
6927     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6928         get_bits(&s->gb, 3);    /* video_format */
6929         get_bits1(&s->gb);      /* video_full_range_flag */
6930         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6931             get_bits(&s->gb, 8); /* colour_primaries */
6932             get_bits(&s->gb, 8); /* transfer_characteristics */
6933             get_bits(&s->gb, 8); /* matrix_coefficients */
6934         }
6935     }
6936
6937     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6938         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6939         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6940     }
6941
6942     sps->timing_info_present_flag = get_bits1(&s->gb);
6943     if(sps->timing_info_present_flag){
6944         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6945         sps->time_scale = get_bits_long(&s->gb, 32);
6946         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6947     }
6948
6949     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6950     if(sps->nal_hrd_parameters_present_flag)
6951         if(decode_hrd_parameters(h, sps) < 0)
6952             return -1;
6953     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6954     if(sps->vcl_hrd_parameters_present_flag)
6955         if(decode_hrd_parameters(h, sps) < 0)
6956             return -1;
6957     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6958         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6959     sps->pic_struct_present_flag = get_bits1(&s->gb);
6960
6961     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6962     if(sps->bitstream_restriction_flag){
6963         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6964         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6965         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6966         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6967         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6968         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6969         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6970
6971         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6972             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6973             return -1;
6974         }
6975     }
6976
6977     return 0;
6978 }
6979
6980 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6981                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6982     MpegEncContext * const s = &h->s;
6983     int i, last = 8, next = 8;
6984     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
6985     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6986         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6987     else
6988     for(i=0;i<size;i++){
6989         if(next)
6990             next = (last + get_se_golomb(&s->gb)) & 0xff;
6991         if(!i && !next){ /* matrix not written, we use the preset one */
6992             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6993             break;
6994         }
6995         last = factors[scan[i]] = next ? next : last;
6996     }
6997 }
6998
6999 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7000                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7001     MpegEncContext * const s = &h->s;
7002     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7003     const uint8_t *fallback[4] = {
7004         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7005         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7006         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7007         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7008     };
7009     if(get_bits1(&s->gb)){
7010         sps->scaling_matrix_present |= is_sps;
7011         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7012         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7013         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7014         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7015         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7016         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7017         if(is_sps || pps->transform_8x8_mode){
7018             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7019             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7020         }
7021     }
7022 }
7023
7024 static inline int decode_seq_parameter_set(H264Context *h){
7025     MpegEncContext * const s = &h->s;
7026     int profile_idc, level_idc;
7027     unsigned int sps_id;
7028     int i;
7029     SPS *sps;
7030
7031     profile_idc= get_bits(&s->gb, 8);
7032     get_bits1(&s->gb);   //constraint_set0_flag
7033     get_bits1(&s->gb);   //constraint_set1_flag
7034     get_bits1(&s->gb);   //constraint_set2_flag
7035     get_bits1(&s->gb);   //constraint_set3_flag
7036     get_bits(&s->gb, 4); // reserved
7037     level_idc= get_bits(&s->gb, 8);
7038     sps_id= get_ue_golomb_31(&s->gb);
7039
7040     if(sps_id >= MAX_SPS_COUNT) {
7041         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7042         return -1;
7043     }
7044     sps= av_mallocz(sizeof(SPS));
7045     if(sps == NULL)
7046         return -1;
7047
7048     sps->profile_idc= profile_idc;
7049     sps->level_idc= level_idc;
7050
7051     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7052     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7053     sps->scaling_matrix_present = 0;
7054
7055     if(sps->profile_idc >= 100){ //high profile
7056         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7057         if(sps->chroma_format_idc == 3)
7058             get_bits1(&s->gb);  //residual_color_transform_flag
7059         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7060         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7061         sps->transform_bypass = get_bits1(&s->gb);
7062         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7063     }else{
7064         sps->chroma_format_idc= 1;
7065     }
7066
7067     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7068     sps->poc_type= get_ue_golomb_31(&s->gb);
7069
7070     if(sps->poc_type == 0){ //FIXME #define
7071         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7072     } else if(sps->poc_type == 1){//FIXME #define
7073         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7074         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7075         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7076         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7077
7078         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7079             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7080             goto fail;
7081         }
7082
7083         for(i=0; i<sps->poc_cycle_length; i++)
7084             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7085     }else if(sps->poc_type != 2){
7086         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7087         goto fail;
7088     }
7089
7090     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7091     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7092         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7093         goto fail;
7094     }
7095     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7096     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7097     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7098     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7099        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7100         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7101         goto fail;
7102     }
7103
7104     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7105     if(!sps->frame_mbs_only_flag)
7106         sps->mb_aff= get_bits1(&s->gb);
7107     else
7108         sps->mb_aff= 0;
7109
7110     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7111
7112 #ifndef ALLOW_INTERLACE
7113     if(sps->mb_aff)
7114         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7115 #endif
7116     sps->crop= get_bits1(&s->gb);
7117     if(sps->crop){
7118         sps->crop_left  = get_ue_golomb(&s->gb);
7119         sps->crop_right = get_ue_golomb(&s->gb);
7120         sps->crop_top   = get_ue_golomb(&s->gb);
7121         sps->crop_bottom= get_ue_golomb(&s->gb);
7122         if(sps->crop_left || sps->crop_top){
7123             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7124         }
7125         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7126             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7127         }
7128     }else{
7129         sps->crop_left  =
7130         sps->crop_right =
7131         sps->crop_top   =
7132         sps->crop_bottom= 0;
7133     }
7134
7135     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7136     if( sps->vui_parameters_present_flag )
7137         decode_vui_parameters(h, sps);
7138
7139     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7140         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7141                sps_id, sps->profile_idc, sps->level_idc,
7142                sps->poc_type,
7143                sps->ref_frame_count,
7144                sps->mb_width, sps->mb_height,
7145                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7146                sps->direct_8x8_inference_flag ? "8B8" : "",
7147                sps->crop_left, sps->crop_right,
7148                sps->crop_top, sps->crop_bottom,
7149                sps->vui_parameters_present_flag ? "VUI" : "",
7150                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7151                );
7152     }
7153     av_free(h->sps_buffers[sps_id]);
7154     h->sps_buffers[sps_id]= sps;
7155     return 0;
7156 fail:
7157     av_free(sps);
7158     return -1;
7159 }
7160
7161 static void
7162 build_qp_table(PPS *pps, int t, int index)
7163 {
7164     int i;
7165     for(i = 0; i < 52; i++)
7166         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7167 }
7168
7169 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7170     MpegEncContext * const s = &h->s;
7171     unsigned int pps_id= get_ue_golomb(&s->gb);
7172     PPS *pps;
7173
7174     if(pps_id >= MAX_PPS_COUNT) {
7175         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7176         return -1;
7177     }
7178
7179     pps= av_mallocz(sizeof(PPS));
7180     if(pps == NULL)
7181         return -1;
7182     pps->sps_id= get_ue_golomb_31(&s->gb);
7183     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7184         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7185         goto fail;
7186     }
7187
7188     pps->cabac= get_bits1(&s->gb);
7189     pps->pic_order_present= get_bits1(&s->gb);
7190     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7191     if(pps->slice_group_count > 1 ){
7192         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7193         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7194         switch(pps->mb_slice_group_map_type){
7195         case 0:
7196 #if 0
7197 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7198 |    run_length[ i ]                                |1  |ue(v)   |
7199 #endif
7200             break;
7201         case 2:
7202 #if 0
7203 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7204 |{                                                  |   |        |
7205 |    top_left_mb[ i ]                               |1  |ue(v)   |
7206 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7207 |   }                                               |   |        |
7208 #endif
7209             break;
7210         case 3:
7211         case 4:
7212         case 5:
7213 #if 0
7214 |   slice_group_change_direction_flag               |1  |u(1)    |
7215 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7216 #endif
7217             break;
7218         case 6:
7219 #if 0
7220 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7221 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7222 |)                                                  |   |        |
7223 |    slice_group_id[ i ]                            |1  |u(v)    |
7224 #endif
7225             break;
7226         }
7227     }
7228     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7229     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7230     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7231         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7232         goto fail;
7233     }
7234
7235     pps->weighted_pred= get_bits1(&s->gb);
7236     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7237     pps->init_qp= get_se_golomb(&s->gb) + 26;
7238     pps->init_qs= get_se_golomb(&s->gb) + 26;
7239     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7240     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7241     pps->constrained_intra_pred= get_bits1(&s->gb);
7242     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7243
7244     pps->transform_8x8_mode= 0;
7245     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7246     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7247     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7248
7249     if(get_bits_count(&s->gb) < bit_length){
7250         pps->transform_8x8_mode= get_bits1(&s->gb);
7251         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7252         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7253     } else {
7254         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7255     }
7256
7257     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7258     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7259     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7260         h->pps.chroma_qp_diff= 1;
7261
7262     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7263         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7264                pps_id, pps->sps_id,
7265                pps->cabac ? "CABAC" : "CAVLC",
7266                pps->slice_group_count,
7267                pps->ref_count[0], pps->ref_count[1],
7268                pps->weighted_pred ? "weighted" : "",
7269                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7270                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7271                pps->constrained_intra_pred ? "CONSTR" : "",
7272                pps->redundant_pic_cnt_present ? "REDU" : "",
7273                pps->transform_8x8_mode ? "8x8DCT" : ""
7274                );
7275     }
7276
7277     av_free(h->pps_buffers[pps_id]);
7278     h->pps_buffers[pps_id]= pps;
7279     return 0;
7280 fail:
7281     av_free(pps);
7282     return -1;
7283 }
7284
7285 /**
7286  * Call decode_slice() for each context.
7287  *
7288  * @param h h264 master context
7289  * @param context_count number of contexts to execute
7290  */
7291 static void execute_decode_slices(H264Context *h, int context_count){
7292     MpegEncContext * const s = &h->s;
7293     AVCodecContext * const avctx= s->avctx;
7294     H264Context *hx;
7295     int i;
7296
7297     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7298         return;
7299     if(context_count == 1) {
7300         decode_slice(avctx, &h);
7301     } else {
7302         for(i = 1; i < context_count; i++) {
7303             hx = h->thread_context[i];
7304             hx->s.error_recognition = avctx->error_recognition;
7305             hx->s.error_count = 0;
7306         }
7307
7308         avctx->execute(avctx, (void *)decode_slice,
7309                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7310
7311         /* pull back stuff from slices to master context */
7312         hx = h->thread_context[context_count - 1];
7313         s->mb_x = hx->s.mb_x;
7314         s->mb_y = hx->s.mb_y;
7315         s->dropable = hx->s.dropable;
7316         s->picture_structure = hx->s.picture_structure;
7317         for(i = 1; i < context_count; i++)
7318             h->s.error_count += h->thread_context[i]->s.error_count;
7319     }
7320 }
7321
7322
7323 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7324     MpegEncContext * const s = &h->s;
7325     AVCodecContext * const avctx= s->avctx;
7326     int buf_index=0;
7327     H264Context *hx; ///< thread context
7328     int context_count = 0;
7329
7330     h->max_contexts = avctx->thread_count;
7331 #if 0
7332     int i;
7333     for(i=0; i<50; i++){
7334         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7335     }
7336 #endif
7337     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7338         h->current_slice = 0;
7339         if (!s->first_field)
7340             s->current_picture_ptr= NULL;
7341     }
7342
7343     for(;;){
7344         int consumed;
7345         int dst_length;
7346         int bit_length;
7347         const uint8_t *ptr;
7348         int i, nalsize = 0;
7349         int err;
7350
7351         if(h->is_avc) {
7352             if(buf_index >= buf_size) break;
7353             nalsize = 0;
7354             for(i = 0; i < h->nal_length_size; i++)
7355                 nalsize = (nalsize << 8) | buf[buf_index++];
7356             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7357                 if(nalsize == 1){
7358                     buf_index++;
7359                     continue;
7360                 }else{
7361                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7362                     break;
7363                 }
7364             }
7365         } else {
7366             // start code prefix search
7367             for(; buf_index + 3 < buf_size; buf_index++){
7368                 // This should always succeed in the first iteration.
7369                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7370                     break;
7371             }
7372
7373             if(buf_index+3 >= buf_size) break;
7374
7375             buf_index+=3;
7376         }
7377
7378         hx = h->thread_context[context_count];
7379
7380         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7381         if (ptr==NULL || dst_length < 0){
7382             return -1;
7383         }
7384         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7385             dst_length--;
7386         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7387
7388         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7389             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7390         }
7391
7392         if (h->is_avc && (nalsize != consumed)){
7393             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7394             consumed= nalsize;
7395         }
7396
7397         buf_index += consumed;
7398
7399         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7400            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7401             continue;
7402
7403       again:
7404         err = 0;
7405         switch(hx->nal_unit_type){
7406         case NAL_IDR_SLICE:
7407             if (h->nal_unit_type != NAL_IDR_SLICE) {
7408                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7409                 return -1;
7410             }
7411             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7412         case NAL_SLICE:
7413             init_get_bits(&hx->s.gb, ptr, bit_length);
7414             hx->intra_gb_ptr=
7415             hx->inter_gb_ptr= &hx->s.gb;
7416             hx->s.data_partitioning = 0;
7417
7418             if((err = decode_slice_header(hx, h)))
7419                break;
7420
7421             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7422             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7423                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7424                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7425                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7426                && avctx->skip_frame < AVDISCARD_ALL){
7427                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7428                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7429                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7430                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7431                 }else
7432                     context_count++;
7433             }
7434             break;
7435         case NAL_DPA:
7436             init_get_bits(&hx->s.gb, ptr, bit_length);
7437             hx->intra_gb_ptr=
7438             hx->inter_gb_ptr= NULL;
7439             hx->s.data_partitioning = 1;
7440
7441             err = decode_slice_header(hx, h);
7442             break;
7443         case NAL_DPB:
7444             init_get_bits(&hx->intra_gb, ptr, bit_length);
7445             hx->intra_gb_ptr= &hx->intra_gb;
7446             break;
7447         case NAL_DPC:
7448             init_get_bits(&hx->inter_gb, ptr, bit_length);
7449             hx->inter_gb_ptr= &hx->inter_gb;
7450
7451             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7452                && s->context_initialized
7453                && s->hurry_up < 5
7454                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7455                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7456                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7457                && avctx->skip_frame < AVDISCARD_ALL)
7458                 context_count++;
7459             break;
7460         case NAL_SEI:
7461             init_get_bits(&s->gb, ptr, bit_length);
7462             decode_sei(h);
7463             break;
7464         case NAL_SPS:
7465             init_get_bits(&s->gb, ptr, bit_length);
7466             decode_seq_parameter_set(h);
7467
7468             if(s->flags& CODEC_FLAG_LOW_DELAY)
7469                 s->low_delay=1;
7470
7471             if(avctx->has_b_frames < 2)
7472                 avctx->has_b_frames= !s->low_delay;
7473             break;
7474         case NAL_PPS:
7475             init_get_bits(&s->gb, ptr, bit_length);
7476
7477             decode_picture_parameter_set(h, bit_length);
7478
7479             break;
7480         case NAL_AUD:
7481         case NAL_END_SEQUENCE:
7482         case NAL_END_STREAM:
7483         case NAL_FILLER_DATA:
7484         case NAL_SPS_EXT:
7485         case NAL_AUXILIARY_SLICE:
7486             break;
7487         default:
7488             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7489         }
7490
7491         if(context_count == h->max_contexts) {
7492             execute_decode_slices(h, context_count);
7493             context_count = 0;
7494         }
7495
7496         if (err < 0)
7497             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7498         else if(err == 1) {
7499             /* Slice could not be decoded in parallel mode, copy down
7500              * NAL unit stuff to context 0 and restart. Note that
7501              * rbsp_buffer is not transferred, but since we no longer
7502              * run in parallel mode this should not be an issue. */
7503             h->nal_unit_type = hx->nal_unit_type;
7504             h->nal_ref_idc   = hx->nal_ref_idc;
7505             hx = h;
7506             goto again;
7507         }
7508     }
7509     if(context_count)
7510         execute_decode_slices(h, context_count);
7511     return buf_index;
7512 }
7513
7514 /**
7515  * returns the number of bytes consumed for building the current frame
7516  */
7517 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7518         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7519         if(pos+10>buf_size) pos=buf_size; // oops ;)
7520
7521         return pos;
7522 }
7523
7524 static int decode_frame(AVCodecContext *avctx,
7525                              void *data, int *data_size,
7526                              const uint8_t *buf, int buf_size)
7527 {
7528     H264Context *h = avctx->priv_data;
7529     MpegEncContext *s = &h->s;
7530     AVFrame *pict = data;
7531     int buf_index;
7532
7533     s->flags= avctx->flags;
7534     s->flags2= avctx->flags2;
7535
7536    /* end of stream, output what is still in the buffers */
7537     if (buf_size == 0) {
7538         Picture *out;
7539         int i, out_idx;
7540
7541 //FIXME factorize this with the output code below
7542         out = h->delayed_pic[0];
7543         out_idx = 0;
7544         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7545             if(h->delayed_pic[i]->poc < out->poc){
7546                 out = h->delayed_pic[i];
7547                 out_idx = i;
7548             }
7549
7550         for(i=out_idx; h->delayed_pic[i]; i++)
7551             h->delayed_pic[i] = h->delayed_pic[i+1];
7552
7553         if(out){
7554             *data_size = sizeof(AVFrame);
7555             *pict= *(AVFrame*)out;
7556         }
7557
7558         return 0;
7559     }
7560
7561     if(h->is_avc && !h->got_avcC) {
7562         int i, cnt, nalsize;
7563         unsigned char *p = avctx->extradata;
7564         if(avctx->extradata_size < 7) {
7565             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7566             return -1;
7567         }
7568         if(*p != 1) {
7569             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7570             return -1;
7571         }
7572         /* sps and pps in the avcC always have length coded with 2 bytes,
7573            so put a fake nal_length_size = 2 while parsing them */
7574         h->nal_length_size = 2;
7575         // Decode sps from avcC
7576         cnt = *(p+5) & 0x1f; // Number of sps
7577         p += 6;
7578         for (i = 0; i < cnt; i++) {
7579             nalsize = AV_RB16(p) + 2;
7580             if(decode_nal_units(h, p, nalsize) < 0) {
7581                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7582                 return -1;
7583             }
7584             p += nalsize;
7585         }
7586         // Decode pps from avcC
7587         cnt = *(p++); // Number of pps
7588         for (i = 0; i < cnt; i++) {
7589             nalsize = AV_RB16(p) + 2;
7590             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7591                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7592                 return -1;
7593             }
7594             p += nalsize;
7595         }
7596         // Now store right nal length size, that will be use to parse all other nals
7597         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7598         // Do not reparse avcC
7599         h->got_avcC = 1;
7600     }
7601
7602     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7603         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7604             return -1;
7605         h->got_avcC = 1;
7606     }
7607
7608     buf_index=decode_nal_units(h, buf, buf_size);
7609     if(buf_index < 0)
7610         return -1;
7611
7612     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7613         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7614         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7615         return -1;
7616     }
7617
7618     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7619         Picture *out = s->current_picture_ptr;
7620         Picture *cur = s->current_picture_ptr;
7621         int i, pics, cross_idr, out_of_order, out_idx;
7622
7623         s->mb_y= 0;
7624
7625         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7626         s->current_picture_ptr->pict_type= s->pict_type;
7627
7628         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7629             ff_vdpau_h264_set_reference_frames(s);
7630
7631         if(!s->dropable) {
7632             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7633             h->prev_poc_msb= h->poc_msb;
7634             h->prev_poc_lsb= h->poc_lsb;
7635         }
7636         h->prev_frame_num_offset= h->frame_num_offset;
7637         h->prev_frame_num= h->frame_num;
7638
7639         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7640             ff_vdpau_h264_picture_complete(s);
7641
7642         /*
7643          * FIXME: Error handling code does not seem to support interlaced
7644          * when slices span multiple rows
7645          * The ff_er_add_slice calls don't work right for bottom
7646          * fields; they cause massive erroneous error concealing
7647          * Error marking covers both fields (top and bottom).
7648          * This causes a mismatched s->error_count
7649          * and a bad error table. Further, the error count goes to
7650          * INT_MAX when called for bottom field, because mb_y is
7651          * past end by one (callers fault) and resync_mb_y != 0
7652          * causes problems for the first MB line, too.
7653          */
7654         if (!FIELD_PICTURE)
7655             ff_er_frame_end(s);
7656
7657         MPV_frame_end(s);
7658
7659         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7660             /* Wait for second field. */
7661             *data_size = 0;
7662
7663         } else {
7664             cur->repeat_pict = 0;
7665
7666             /* Signal interlacing information externally. */
7667             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7668             if(h->sps.pic_struct_present_flag){
7669                 switch (h->sei_pic_struct)
7670                 {
7671                 case SEI_PIC_STRUCT_FRAME:
7672                     cur->interlaced_frame = 0;
7673                     break;
7674                 case SEI_PIC_STRUCT_TOP_FIELD:
7675                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7676                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7677                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7678                     cur->interlaced_frame = 1;
7679                     break;
7680                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7681                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7682                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7683                     // From these hints, let the applications decide if they apply deinterlacing.
7684                     cur->repeat_pict = 1;
7685                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7686                     break;
7687                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7688                     // Force progressive here, as doubling interlaced frame is a bad idea.
7689                     cur->interlaced_frame = 0;
7690                     cur->repeat_pict = 2;
7691                     break;
7692                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7693                     cur->interlaced_frame = 0;
7694                     cur->repeat_pict = 4;
7695                     break;
7696                 }
7697             }else{
7698                 /* Derive interlacing flag from used decoding process. */
7699                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7700             }
7701
7702             if (cur->field_poc[0] != cur->field_poc[1]){
7703                 /* Derive top_field_first from field pocs. */
7704                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7705             }else{
7706                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7707                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7708                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7709                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7710                         cur->top_field_first = 1;
7711                     else
7712                         cur->top_field_first = 0;
7713                 }else{
7714                     /* Most likely progressive */
7715                     cur->top_field_first = 0;
7716                 }
7717             }
7718
7719         //FIXME do something with unavailable reference frames
7720
7721             /* Sort B-frames into display order */
7722
7723             if(h->sps.bitstream_restriction_flag
7724                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7725                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7726                 s->low_delay = 0;
7727             }
7728
7729             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7730                && !h->sps.bitstream_restriction_flag){
7731                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7732                 s->low_delay= 0;
7733             }
7734
7735             pics = 0;
7736             while(h->delayed_pic[pics]) pics++;
7737
7738             assert(pics <= MAX_DELAYED_PIC_COUNT);
7739
7740             h->delayed_pic[pics++] = cur;
7741             if(cur->reference == 0)
7742                 cur->reference = DELAYED_PIC_REF;
7743
7744             out = h->delayed_pic[0];
7745             out_idx = 0;
7746             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7747                 if(h->delayed_pic[i]->poc < out->poc){
7748                     out = h->delayed_pic[i];
7749                     out_idx = i;
7750                 }
7751             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7752
7753             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7754
7755             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7756                 { }
7757             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7758                || (s->low_delay &&
7759                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7760                  || cur->pict_type == FF_B_TYPE)))
7761             {
7762                 s->low_delay = 0;
7763                 s->avctx->has_b_frames++;
7764             }
7765
7766             if(out_of_order || pics > s->avctx->has_b_frames){
7767                 out->reference &= ~DELAYED_PIC_REF;
7768                 for(i=out_idx; h->delayed_pic[i]; i++)
7769                     h->delayed_pic[i] = h->delayed_pic[i+1];
7770             }
7771             if(!out_of_order && pics > s->avctx->has_b_frames){
7772                 *data_size = sizeof(AVFrame);
7773
7774                 h->outputed_poc = out->poc;
7775                 *pict= *(AVFrame*)out;
7776             }else{
7777                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7778             }
7779         }
7780     }
7781
7782     assert(pict->data[0] || !*data_size);
7783     ff_print_debug_info(s, pict);
7784 //printf("out %d\n", (int)pict->data[0]);
7785 #if 0 //?
7786
7787     /* Return the Picture timestamp as the frame number */
7788     /* we subtract 1 because it is added on utils.c     */
7789     avctx->frame_number = s->picture_number - 1;
7790 #endif
7791     return get_consumed_bytes(s, buf_index, buf_size);
7792 }
#if 0
/**
 * Fill h->mb_avail[] for the current macroblock (disabled code).
 *
 * Entries 0..2 describe the macroblocks at top-left, top and top-right,
 * entry 3 the one to the left; each is set when that position exists and
 * its slice_table entry equals h->slice_num. Entries 4 and 5 are constants.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7812
7813 #ifdef TEST
7814 #undef printf
7815 #undef random
7816 #define COUNT 8000
7817 #define SIZE (COUNT*40)
7818 int main(void){
7819     int i;
7820     uint8_t temp[SIZE];
7821     PutBitContext pb;
7822     GetBitContext gb;
7823 //    int int_temp[10000];
7824     DSPContext dsp;
7825     AVCodecContext avctx;
7826
7827     dsputil_init(&dsp, &avctx);
7828
7829     init_put_bits(&pb, temp, SIZE);
7830     printf("testing unsigned exp golomb\n");
7831     for(i=0; i<COUNT; i++){
7832         START_TIMER
7833         set_ue_golomb(&pb, i);
7834         STOP_TIMER("set_ue_golomb");
7835     }
7836     flush_put_bits(&pb);
7837
7838     init_get_bits(&gb, temp, 8*SIZE);
7839     for(i=0; i<COUNT; i++){
7840         int j, s;
7841
7842         s= show_bits(&gb, 24);
7843
7844         START_TIMER
7845         j= get_ue_golomb(&gb);
7846         if(j != i){
7847             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7848 //            return -1;
7849         }
7850         STOP_TIMER("get_ue_golomb");
7851     }
7852
7853
7854     init_put_bits(&pb, temp, SIZE);
7855     printf("testing signed exp golomb\n");
7856     for(i=0; i<COUNT; i++){
7857         START_TIMER
7858         set_se_golomb(&pb, i - COUNT/2);
7859         STOP_TIMER("set_se_golomb");
7860     }
7861     flush_put_bits(&pb);
7862
7863     init_get_bits(&gb, temp, 8*SIZE);
7864     for(i=0; i<COUNT; i++){
7865         int j, s;
7866
7867         s= show_bits(&gb, 24);
7868
7869         START_TIMER
7870         j= get_se_golomb(&gb);
7871         if(j != i - COUNT/2){
7872             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7873 //            return -1;
7874         }
7875         STOP_TIMER("get_se_golomb");
7876     }
7877
7878 #if 0
7879     printf("testing 4x4 (I)DCT\n");
7880
7881     DCTELEM block[16];
7882     uint8_t src[16], ref[16];
7883     uint64_t error= 0, max_error=0;
7884
7885     for(i=0; i<COUNT; i++){
7886         int j;
7887 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7888         for(j=0; j<16; j++){
7889             ref[j]= random()%255;
7890             src[j]= random()%255;
7891         }
7892
7893         h264_diff_dct_c(block, src, ref, 4);
7894
7895         //normalize
7896         for(j=0; j<16; j++){
7897 //            printf("%d ", block[j]);
7898             block[j]= block[j]*4;
7899             if(j&1) block[j]= (block[j]*4 + 2)/5;
7900             if(j&4) block[j]= (block[j]*4 + 2)/5;
7901         }
7902 //        printf("\n");
7903
7904         s->dsp.h264_idct_add(ref, block, 4);
7905 /*        for(j=0; j<16; j++){
7906             printf("%d ", ref[j]);
7907         }
7908         printf("\n");*/
7909
7910         for(j=0; j<16; j++){
7911             int diff= FFABS(src[j] - ref[j]);
7912
7913             error+= diff*diff;
7914             max_error= FFMAX(max_error, diff);
7915         }
7916     }
7917     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7918     printf("testing quantizer\n");
7919     for(qp=0; qp<52; qp++){
7920         for(i=0; i<16; i++)
7921             src1_block[i]= src2_block[i]= random()%255;
7922
7923     }
7924     printf("Testing NAL layer\n");
7925
7926     uint8_t bitstream[COUNT];
7927     uint8_t nal[COUNT*2];
7928     H264Context h;
7929     memset(&h, 0, sizeof(H264Context));
7930
7931     for(i=0; i<COUNT; i++){
7932         int zeros= i;
7933         int nal_length;
7934         int consumed;
7935         int out_length;
7936         uint8_t *out;
7937         int j;
7938
7939         for(j=0; j<COUNT; j++){
7940             bitstream[j]= (random() % 255) + 1;
7941         }
7942
7943         for(j=0; j<zeros; j++){
7944             int pos= random() % COUNT;
7945             while(bitstream[pos] == 0){
7946                 pos++;
7947                 pos %= COUNT;
7948             }
7949             bitstream[pos]=0;
7950         }
7951
7952         START_TIMER
7953
7954         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7955         if(nal_length<0){
7956             printf("encoding failed\n");
7957             return -1;
7958         }
7959
7960         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7961
7962         STOP_TIMER("NAL")
7963
7964         if(out_length != COUNT){
7965             printf("incorrect length %d %d\n", out_length, COUNT);
7966             return -1;
7967         }
7968
7969         if(consumed != nal_length){
7970             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7971             return -1;
7972         }
7973
7974         if(memcmp(bitstream, out, COUNT)){
7975             printf("mismatch\n");
7976             return -1;
7977         }
7978     }
7979 #endif
7980
7981     printf("Testing RBSP\n");
7982
7983
7984     return 0;
7985 }
7986 #endif /* TEST */
7987
7988
7989 static av_cold int decode_end(AVCodecContext *avctx)
7990 {
7991     H264Context *h = avctx->priv_data;
7992     MpegEncContext *s = &h->s;
7993     int i;
7994
7995     av_freep(&h->rbsp_buffer[0]);
7996     av_freep(&h->rbsp_buffer[1]);
7997     free_tables(h); //FIXME cleanup init stuff perhaps
7998
7999     for(i = 0; i < MAX_SPS_COUNT; i++)
8000         av_freep(h->sps_buffers + i);
8001
8002     for(i = 0; i < MAX_PPS_COUNT; i++)
8003         av_freep(h->pps_buffers + i);
8004
8005     MPV_common_end(s);
8006
8007 //    memset(h, 0, sizeof(H264Context));
8008
8009     return 0;
8010 }
8011
8012
8013 AVCodec h264_decoder = {
8014     "h264",
8015     CODEC_TYPE_VIDEO,
8016     CODEC_ID_H264,
8017     sizeof(H264Context),
8018     decode_init,
8019     NULL,
8020     decode_end,
8021     decode_frame,
8022     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8023     .flush= flush_dpb,
8024     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8025 };
8026
8027 #if CONFIG_H264_VDPAU_DECODER
8028 AVCodec h264_vdpau_decoder = {
8029     "h264_vdpau",
8030     CODEC_TYPE_VIDEO,
8031     CODEC_ID_H264,
8032     sizeof(H264Context),
8033     decode_init,
8034     NULL,
8035     decode_end,
8036     decode_frame,
8037     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8038     .flush= flush_dpb,
8039     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8040 };
8041 #endif
8042
8043 #if CONFIG_SVQ3_DECODER
8044 #include "svq3.c"
8045 #endif