libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
     /*
      * Static VLC objects of this file. Each VLC (or array of VLCs) is declared
      * together with a fixed-size VLC_TYPE backing array and a constant holding
      * the number of rows of that array.
      * NOTE(review): the code that initializes and reads these tables is not
      * part of this chunk - confirm usage there before changing any size.
      */
  54 static VLC coeff_token_vlc[4];
  55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2]; // row count is the sum of the four sizes below
  56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  57
  58 static VLC chroma_dc_coeff_token_vlc;
  59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  60 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  61
  62 static VLC total_zeros_vlc[15];
  63 static VLC_TYPE total_zeros_vlc_tables[15][512][2]; // one 512-row table per VLC
  64 static const int total_zeros_vlc_tables_size = 512;
  65
  66 static VLC chroma_dc_total_zeros_vlc[3];
  67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2]; // one 8-row table per VLC
  68 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  69
  70 static VLC run_vlc[6];
  71 static VLC_TYPE run_vlc_tables[6][8][2]; // one 8-row table per VLC
  72 static const int run_vlc_tables_size = 8;
  73
  74 static VLC run7_vlc;
  75 static VLC_TYPE run7_vlc_table[96][2];
  76 static const int run7_vlc_table_size = 96;
  77
     /* Prototypes of static functions that are used before their definitions;
      * the definitions are not part of this chunk. */
  78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
     /* Packs the low 16 bits of a and the low 16 bits of b into one 32-bit word.
      * The halves are ordered so that, once the word is stored in memory, a is
      * the lower-addressed half on both little- and big-endian hosts.
      * The shift is performed on uint32_t because left-shifting a negative int
      * (callers pass signed values) is undefined behaviour in C; the resulting
      * bit pattern is the same as before on two's complement targets. */
  85 #ifdef WORDS_BIGENDIAN
  86    return ((uint32_t)b&0xFFFF) + ((uint32_t)a<<16);
  87 #else
  88    return ((uint32_t)a&0xFFFF) + ((uint32_t)b<<16);
  89 #endif
  90 }
  91
  92 static const uint8_t rem6[52]={ // rem6[i] == i % 6 for i = 0..51
  93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  94 };
  95
  96 static const uint8_t div6[52]={ // div6[i] == i / 6 for i = 0..51
  97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  98 };
  99
     /*
      * Index sets used by fill_caches() to read data of the left neighbour(s).
      * Row 0 is the default. Row 3 is selected when the current macroblock is
      * field coded and the left pair is not; row 2 (top macroblock of the pair)
      * or row 1 (bottom macroblock) when the current macroblock is frame coded
      * and the left pair is field coded.
      * Entries 0..3 are used as row indices into the left neighbour's intra4x4
      * mode, non-zero count and motion data, entries 4..7 as indices into its
      * non_zero_count[] for cache column 0.
      */
 100 static const uint8_t left_block_options[4][8]={
 101     {0,1,2,3,7,10,8,11},
 102     {2,2,3,3,8,11,8,11},
 103     {0,0,1,1,7,10,7,10},
 104     {0,2,0,2,7,10,7,10}
 105 };
 106
     // NOTE(review): cavlc_level_tab is neither written nor read in this chunk;
     // it has 7 tables of 1<<LEVEL_TAB_BITS entries with 2 values each.
 107 #define LEVEL_TAB_BITS 8
 108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 109
 110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
     /*
      * Loads the per-macroblock caches of h with data of the macroblocks that
      * neighbour the current macroblock h->mb_xy (top-left, top, top-right and
      * the two left neighbours).
      *
      * h           decoder context; the neighbour positions found here are also
      *             stored in h->top_mb_xy and h->left_mb_xy[]
      * mb_type     type flags of the current macroblock
      * for_deblock nonzero when the caches are filled for deblocking: neighbours
      *             are then accepted when their slice_table entry is below
      *             0xFFFF instead of equal to h->slice_num, the intra
      *             availability / intra4x4 mode setup is skipped and only the
      *             top and left motion data is loaded
      *
      * Depending on mb_type and h->pps.cabac this writes the *_samples_available
      * masks, intra4x4_pred_mode_cache, non_zero_count_cache, top_cbp/left_cbp,
      * mv_cache, ref_cache, mvd_cache, direct_cache and neighbor_transform_size.
      */
 111     MpegEncContext * const s = &h->s;
 112     const int mb_xy= h->mb_xy;
 113     int topleft_xy, top_xy, topright_xy, left_xy[2];
 114     int topleft_type, top_type, topright_type, left_type[2];
 115     const uint8_t * left_block;
 116     int topleft_partition= -1;
 117     int i;
 118
         // NOTE(review): FIELD_PICTURE presumably is 1 for field pictures, making the row above two strides away - confirm
 119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 120
         // Deblocking shortcut: nothing is loaded (not even h->top_mb_xy/left_mb_xy) when
         // h->slice_num is 1 or the top neighbour has the same slice_table entry, and the frame is not MBAFF.
 121     //FIXME deblocking could skip the intra and nnz parts.
 122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 123         return;
 124
 125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 126      * stuff, I can't imagine that these complex rules are worth it. */
 127
         // default neighbour positions; the MBAFF block below adjusts them
 128     topleft_xy = top_xy - 1;
 129     topright_xy= top_xy + 1;
 130     left_xy[1] = left_xy[0] = mb_xy-1;
 131     left_block = left_block_options[0];
 132     if(FRAME_MBAFF){
 133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 134         const int top_pair_xy      = pair_xy     - s->mb_stride;
 135         const int topleft_pair_xy  = top_pair_xy - 1;
 136         const int topright_pair_xy = top_pair_xy + 1;
 137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
 142         const int bottom = (s->mb_y & 1);
 143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
 144
             // a field macroblock takes an upper neighbour from one row further up when it is the
             // bottom macroblock of its pair or when that neighbouring pair is field coded
 145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
 146             top_xy -= s->mb_stride;
 147         }
 148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
 149             topleft_xy -= s->mb_stride;
 150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
 151             topleft_xy += s->mb_stride;
 152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 153             topleft_partition = 0;
 154         }
 155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
 156             topright_xy -= s->mb_stride;
 157         }
             // left pair coded in the other (field/frame) mode: address it through the pair
             // and select the matching row of left_block_options
 158         if (left_mb_field_flag != curr_mb_field_flag) {
 159             left_xy[1] = left_xy[0] = pair_xy - 1;
 160             if (curr_mb_field_flag) {
 161                 left_xy[1] += s->mb_stride;
 162                 left_block = left_block_options[3];
 163             } else {
 164                 left_block= left_block_options[2 - bottom];
 165             }
 166         }
 167     }
 168
         // remember the neighbour positions in the context
 169     h->top_mb_xy = top_xy;
 170     h->left_mb_xy[0] = left_xy[0];
 171     h->left_mb_xy[1] = left_xy[1];
 172     if(for_deblock){
             // deblocking: top-left/top-right are not used; a neighbour is taken when its slice_table entry is below 0xFFFF
 173         topleft_type = 0;
 174         topright_type = 0;
 175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 178
 179         if(MB_MBAFF && !IS_INTRA(mb_type)){
 180             int list;
 181             for(list=0; list<h->list_count; list++){
 182                 //These values were changed for ease of performing MC, we need to change them back
 183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 184                 //the MC code from changing ref_cache and rather use a temporary array.
 185                 if(USES_LIST(mb_type,list)){
 186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 189                     ref += h->b8_stride;
 190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 192                 }
 193             }
 194         }
 195     }else{
             // decoding: a neighbour is only used when it belongs to the current slice (type 0 otherwise)
 196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 201
         // intra macroblock: work out which neighbouring samples may be used for prediction.
         // With constrained_intra_pred only neighbours matching IS_INTRA(-1) count as usable.
 202     if(IS_INTRA(mb_type)){
 203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 204         h->topleft_samples_available=
 205         h->top_samples_available=
 206         h->left_samples_available= 0xFFFF;
 207         h->topright_samples_available= 0xEEEA;
 208
 209         if(!(top_type & type_mask)){
 210             h->topleft_samples_available= 0xB3FF;
 211             h->top_samples_available= 0x33FF;
 212             h->topright_samples_available= 0x26EA;
 213         }
 214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 215             if(IS_INTERLACED(mb_type)){
 216                 if(!(left_type[0] & type_mask)){
 217                     h->topleft_samples_available&= 0xDFFF;
 218                     h->left_samples_available&= 0x5FFF;
 219                 }
 220                 if(!(left_type[1] & type_mask)){
 221                     h->topleft_samples_available&= 0xFF5F;
 222                     h->left_samples_available&= 0xFF5F;
 223                 }
 224             }else{
 225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 227                 assert(left_xy[0] == left_xy[1]);
 228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 229                     h->topleft_samples_available&= 0xDF5F;
 230                     h->left_samples_available&= 0x5F5F;
 231                 }
 232             }
 233         }else{
 234             if(!(left_type[0] & type_mask)){
 235                 h->topleft_samples_available&= 0xDF5F;
 236                 h->left_samples_available&= 0x5F5F;
 237             }
 238         }
 239
 240         if(!(topleft_type & type_mask))
 241             h->topleft_samples_available&= 0x7FFF;
 242
 243         if(!(topright_type & type_mask))
 244             h->topright_samples_available&= 0xFBFF;
 245
             // load the neighbours' intra4x4 modes into the cache border; for a neighbour that is not
             // intra4x4 the border gets -1 when the neighbour is not usable and 2 otherwise
 246         if(IS_INTRA4x4(mb_type)){
 247             if(IS_INTRA4x4(top_type)){
 248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 252             }else{
 253                 int pred;
 254                 if(!(top_type & type_mask))
 255                     pred= -1;
 256                 else{
 257                     pred= 2;
 258                 }
 259                 h->intra4x4_pred_mode_cache[4+8*0]=
 260                 h->intra4x4_pred_mode_cache[5+8*0]=
 261                 h->intra4x4_pred_mode_cache[6+8*0]=
 262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 263             }
 264             for(i=0; i<2; i++){
 265                 if(IS_INTRA4x4(left_type[i])){
 266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 268                 }else{
 269                     int pred;
 270                     if(!(left_type[i] & type_mask))
 271                         pred= -1;
 272                     else{
 273                         pred= 2;
 274                     }
 275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 277                 }
 278             }
 279         }
 280     }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // non-zero coefficient counts of the top neighbour go into the T positions sketched above;
         // without a top neighbour they are set to 0 for CABAC non-intra macroblocks and to 64 otherwise
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
         // same for the two left neighbours (the L positions sketched above)
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
         // CABAC additionally uses the cbp_table entries of the top and left neighbours;
         // a missing neighbour yields 0x1C0 for intra macroblocks and 0 otherwise
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
         // inter/direct macroblock: load the neighbours' motion vectors and reference indices for each list
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
                 // top neighbour: 4 motion vectors of its last block row and its 2 lower reference indices
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
                 // the two left neighbours; each one fills two rows of the cache column left of block 0
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
                 // deblocking, and direct macroblocks without spatial prediction outside MBAFF, stop after top and left
 408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 409                 continue;
 410
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
                 // skipped/direct macroblocks outside MBAFF need none of the remaining setup
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
                     // B slices: clear direct_cache for the current macroblock, then record which top/left neighbour blocks are direct
 481                 if(h->slice_type_nos == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
                 // MBAFF: for cached neighbours coded in the other field/frame mode, rescale the
                 // reference index and the vertical component of mv and mvd (MAP_MVS lists the cache slots)
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
         // number (0..2) of the top and first left neighbours for which IS_8x8DCT() is set
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
     /* Copies 7 entries of h->intra4x4_pred_mode_cache into the table row
      * h->intra4x4_pred_mode[h->mb_xy] of the current macroblock. */
 550     const int cur_mb= h->mb_xy;
 551     int k;
 552
 553     // entries 0..3 come from cache column 7, rows 1..4
 554     for(k=0; k<4; k++)
 555         h->intra4x4_pred_mode[cur_mb][k]= h->intra4x4_pred_mode_cache[7+8*(k+1)];
 556     // entries 4..6 come from cache row 4, columns 4..6
 557     for(k=0; k<3; k++)
 558         h->intra4x4_pred_mode[cur_mb][4+k]= h->intra4x4_pred_mode_cache[4+k+8*4];
 559 }
 560
 561 /**
 562  * Checks if the top & left blocks needed by the cached intra4x4 prediction modes are available and changes DC modes so they only use the available blocks.
 563  * @return 0 on success, -1 (after logging the requested mode and the macroblock position) if a needed block is unavailable
 564  */
 565 static inline int check_intra4x4_pred_mode(H264Context *h){
 566     MpegEncContext * const s = &h->s;
 567     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 568     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 569     int i;
 570
 571     if(!(h->top_samples_available&0x8000)){
 572         for(i=0; i<4; i++){
 573             const int mode  = h->intra4x4_pred_mode_cache[scan8[0] + i];
 574             const int status= top[ mode ]; // <0: mode impossible, >0: replacement mode, 0: keep mode
 575             if(status<0){ // log the requested mode, not the table status (which is always -1 here)
 576                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", mode, s->mb_x, s->mb_y);
 577                 return -1;
 578             }
 579             if(status) h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 580         }
 581     }
 582
 583     if((h->left_samples_available&0x8888)!=0x8888){
 584         static const int mask[4]={0x8000,0x2000,0x80,0x20}; // availability bit tested for each of the 4 rows
 585         for(i=0; i<4; i++){
 586             if(!(h->left_samples_available&mask[i])){
 587                 const int mode  = h->intra4x4_pred_mode_cache[scan8[0] + 8*i];
 588                 const int status= left[ mode ];
 589                 if(status<0){ // log the requested mode, not the table status
 590                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", mode, s->mb_x, s->mb_y);
 591                     return -1;
 592                 }
 593                 if(status) h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 594             }
 595         }
 596     }
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * Checks that the top & left blocks needed by an intra prediction mode are available and, where the lookup tables allow it, replaces the mode by one that only uses the available blocks.
 602  * @return the mode to use, or -1 (after logging an error) if mode is outside 0..6 or cannot be replaced
 603  */
 604 static inline int check_intra_pred_mode(H264Context *h, int mode){
 605     MpegEncContext * const s = &h->s;
 606     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 607     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 608     const int left_avail= h->left_samples_available&0x8080; // the two left availability bits tested below
 609
 610     if(mode > 6U) {
 611         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 612         return -1;
 613     }
 614
 615     if(!(h->top_samples_available&0x8000)){
 616         mode= top[ mode ];
 617         if(mode<0){
 618             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 619             return -1;
 620         }
 621     }
 622
 623     if(left_avail != 0x8080){
 624         mode= left[ mode ];
 625         if(left_avail) //mad cow disease mode, aka MBAFF + constrained_intra_pred
 626             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(left_avail&0x8000)) + 2*(mode == DC_128_PRED8x8);
 627         if(mode<0){
 628             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 629             return -1;
 630         }
 631     }
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
/**
 * Fetches the diagonal neighbour motion vector and reference index for the
 * partition whose top-left block sits at cache index i.
 * Outside the MBAFF special cases this is the top-right neighbour
 * (cache index i - 8 + part_width) when its reference is not
 * PART_NOT_AVAILABLE, and the top-left neighbour (i - 8 - 1) otherwise.
 * @param C receives a pointer to the selected motion vector
 * @param i cache index (scan8 based) of the partition
 * @param list reference list to read from
 * @param part_width partition width in 4x4 block units
 * @return the reference index belonging to the motion vector stored in *C
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* the cache slot at scan8[0]-2 is used as scratch space: it is zeroed
         * here, *C points at it, and SET_DIAG_MV overwrites it before returning */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV reads the motion vector and reference index stored in the
 * current picture at 4x4 block position (X4,Y4), applies MV_OP to the vertical
 * mv component and REF_OP to the reference index, writes the mv into the
 * scratch slot and RETURNS from fetch_diagonal_mv. It returns LIST_NOT_USED
 * if the macroblock at that position does not use this list. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: top-right neighbour if available, top-left one otherwise */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996     assert(h->ref_list[1][0].reference&3);
 997
 998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 999
1000     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1002             int cur_poc = s->current_picture_ptr->poc;
1003             int *col_poc = h->ref_list[1]->field_poc;
1004             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006             b8_stride = 0;
1007         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009             mb_xy += s->mb_stride*fieldoff;
1010         }
1011         goto single_col;
1012     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1013         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1014             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017             b8_stride *= 3;
1018             b4_stride *= 6;
1019             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022                 && !is_b8x8){
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025             }else{
1026                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1028             }
1029         }else{                                           //     AFR/FR    -> AFR/FR
1030 single_col:
1031             mb_type_col[0] =
1032             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035                 * so we know exactly what block size to use */
1036                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041             }else{
1042                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1044             }
1045         }
1046     }
1047
1048     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052     if(!b8_stride){
1053         if(s->mb_y&1){
1054             l1ref0 += h->b8_stride;
1055             l1ref1 += h->b8_stride;
1056             l1mv0  +=  2*b4_stride;
1057             l1mv1  +=  2*b4_stride;
1058         }
1059     }
1060
1061     if(h->direct_spatial_mv_pred){
1062         int ref[2];
1063         int mv[2][2];
1064         int list;
1065
1066         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1067
1068         /* ref = min(neighbors) */
1069         for(list=0; list<2; list++){
1070             int refa = h->ref_cache[list][scan8[0] - 1];
1071             int refb = h->ref_cache[list][scan8[0] - 8];
1072             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073             if(refc == PART_NOT_AVAILABLE)
1074                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076             if(ref[list] < 0)
1077                 ref[list] = -1;
1078         }
1079
1080         if(ref[0] < 0 && ref[1] < 0){
1081             ref[0] = ref[1] = 0;
1082             mv[0][0] = mv[0][1] =
1083             mv[1][0] = mv[1][1] = 0;
1084         }else{
1085             for(list=0; list<2; list++){
1086                 if(ref[list] >= 0)
1087                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088                 else
1089                     mv[list][0] = mv[list][1] = 0;
1090             }
1091         }
1092
1093         if(ref[1] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L1;
1096             sub_mb_type &= ~MB_TYPE_L1;
1097         }else if(ref[0] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L0;
1100             sub_mb_type &= ~MB_TYPE_L0;
1101         }
1102
1103         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104             for(i8=0; i8<4; i8++){
1105                 int x8 = i8&1;
1106                 int y8 = i8>>1;
1107                 int xy8 = x8+y8*b8_stride;
1108                 int xy4 = 3*x8+y8*b4_stride;
1109                 int a=0, b=0;
1110
1111                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112                     continue;
1113                 h->sub_mb_type[i8] = sub_mb_type;
1114
1115                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117                 if(!IS_INTRA(mb_type_col[y8])
1118                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120                     if(ref[0] > 0)
1121                         a= pack16to32(mv[0][0],mv[0][1]);
1122                     if(ref[1] > 0)
1123                         b= pack16to32(mv[1][0],mv[1][1]);
1124                 }else{
1125                     a= pack16to32(mv[0][0],mv[0][1]);
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127                 }
1128                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1130             }
1131         }else if(IS_16X16(*mb_type)){
1132             int a=0, b=0;
1133
1134             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136             if(!IS_INTRA(mb_type_col[0])
1137                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139                        && (h->x264_build>33 || !h->x264_build)))){
1140                 if(ref[0] > 0)
1141                     a= pack16to32(mv[0][0],mv[0][1]);
1142                 if(ref[1] > 0)
1143                     b= pack16to32(mv[1][0],mv[1][1]);
1144             }else{
1145                 a= pack16to32(mv[0][0],mv[0][1]);
1146                 b= pack16to32(mv[1][0],mv[1][1]);
1147             }
1148             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150         }else{
1151             for(i8=0; i8<4; i8++){
1152                 const int x8 = i8&1;
1153                 const int y8 = i8>>1;
1154
1155                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156                     continue;
1157                 h->sub_mb_type[i8] = sub_mb_type;
1158
1159                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1163
1164                 /* col_zero_flag */
1165                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1166                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167                                                   && (h->x264_build>33 || !h->x264_build)))){
1168                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169                     if(IS_SUB_8X8(sub_mb_type)){
1170                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172                             if(ref[0] == 0)
1173                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                             if(ref[1] == 0)
1175                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1176                         }
1177                     }else
1178                     for(i4=0; i4<4; i4++){
1179                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181                             if(ref[0] == 0)
1182                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183                             if(ref[1] == 0)
1184                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1185                         }
1186                     }
1187                 }
1188             }
1189         }
1190     }else{ /* direct temporal mv pred */
1191         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192         const int *dist_scale_factor = h->dist_scale_factor;
1193         int ref_offset= 0;
1194
1195         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1199         }
1200         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201             ref_offset += 16;
1202
1203         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204             /* FIXME assumes direct_8x8_inference == 1 */
1205             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1206
1207             for(i8=0; i8<4; i8++){
1208                 const int x8 = i8&1;
1209                 const int y8 = i8>>1;
1210                 int ref0, scale;
1211                 const int16_t (*l1mv)[2]= l1mv0;
1212
1213                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214                     continue;
1215                 h->sub_mb_type[i8] = sub_mb_type;
1216
1217                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                 if(IS_INTRA(mb_type_col[y8])){
1219                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222                     continue;
1223                 }
1224
1225                 ref0 = l1ref0[x8 + y8*b8_stride];
1226                 if(ref0 >= 0)
1227                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228                 else{
1229                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230                     l1mv= l1mv1;
1231                 }
1232                 scale = dist_scale_factor[ref0];
1233                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1234
1235                 {
1236                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237                     int my_col = (mv_col[1]<<y_shift)/2;
1238                     int mx = (scale * mv_col[0] + 128) >> 8;
1239                     int my = (scale * my_col + 128) >> 8;
1240                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1242                 }
1243             }
1244             return;
1245         }
1246
1247         /* one-to-one mv scaling */
1248
1249         if(IS_16X16(*mb_type)){
1250             int ref, mv0, mv1;
1251
1252             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253             if(IS_INTRA(mb_type_col[0])){
1254                 ref=mv0=mv1=0;
1255             }else{
1256                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258                 const int scale = dist_scale_factor[ref0];
1259                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260                 int mv_l0[2];
1261                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263                 ref= ref0;
1264                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1266             }
1267             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270         }else{
1271             for(i8=0; i8<4; i8++){
1272                 const int x8 = i8&1;
1273                 const int y8 = i8>>1;
1274                 int ref0, scale;
1275                 const int16_t (*l1mv)[2]= l1mv0;
1276
1277                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278                     continue;
1279                 h->sub_mb_type[i8] = sub_mb_type;
1280                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                 if(IS_INTRA(mb_type_col[0])){
1282                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285                     continue;
1286                 }
1287
1288                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289                 if(ref0 >= 0)
1290                     ref0 = map_col_to_list0[0][ref0];
1291                 else{
1292                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293                     l1mv= l1mv1;
1294                 }
1295                 scale = dist_scale_factor[ref0];
1296
1297                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298                 if(IS_SUB_8X8(sub_mb_type)){
1299                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300                     int mx = (scale * mv_col[0] + 128) >> 8;
1301                     int my = (scale * mv_col[1] + 128) >> 8;
1302                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304                 }else
1305                 for(i4=0; i4<4; i4++){
1306                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1312                 }
1313             }
1314         }
1315     }
1316 }
1317
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319     MpegEncContext * const s = &h->s;
1320     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322     int list;
1323
1324     if(!USES_LIST(mb_type, 0))
1325         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1326
1327     for(list=0; list<h->list_count; list++){
1328         int y;
1329         if(!USES_LIST(mb_type, list))
1330             continue;
1331
1332         for(y=0; y<4; y++){
1333             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1335         }
1336         if( h->pps.cabac ) {
1337             if(IS_SKIP(mb_type))
1338                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339             else
1340             for(y=0; y<4; y++){
1341                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1343             }
1344         }
1345
1346         {
1347             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1352         }
1353     }
1354
1355     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356         if(IS_8X8(mb_type)){
1357             uint8_t *direct_table = &h->direct_table[b8_xy];
1358             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1361         }
1362     }
1363 }
1364
/**
 * Parses the NAL unit header byte and returns the payload with the
 * escape sequences (00 00 03 -> 00 00) removed.
 *
 * nal_ref_idc and nal_unit_type are taken from the first byte of src and
 * stored in h. The payload is then scanned for the first 00 00 0x pattern
 * with x <= 3: if x != 3 it is treated as a start code and the payload is cut
 * there, if x == 3 the payload is copied into h->rbsp_buffer[] while dropping
 * the escape bytes. When no such pattern is found no copy is made.
 *
 * @param h          context; receives nal_ref_idc / nal_unit_type and provides
 *                   the rbsp_buffer[] the result may point into
 * @param src        input data, starting at the NAL header byte
 * @param dst_length set to the number of payload bytes returned
 * @param consumed   set to the number of input bytes used, header byte included
 * @param length     number of input bytes at src, header byte included
 * @return pointer to the payload: the input itself (just after the header
 *         byte) when nothing was unescaped, otherwise the rbsp buffer;
 *         NULL if the rbsp buffer is NULL after av_fast_malloc()
 */
const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
    int i, si, di;
    uint8_t *dst;
    int bufidx;

//    src[0]&0x80;                //forbidden bit
    h->nal_ref_idc= src[0]>>5;
    h->nal_unit_type= src[0]&0x1F;

    src++; length--; // skip the header byte
#if 0
    for(i=0; i<length; i++)
        printf("%2X ", src[i]);
#endif

    /* Find the first 00 00 0x (x <= 3) sequence. Three variants of the loop
     * head are selected at compile time; they share the loop body below.
     * RS is subtracted from i at the end of every iteration, so together with
     * the loop increment i effectively advances by 2.
     * NOTE(review): the word sized reads and the while(src[i]) scan can read
     * past src+length; this presumably relies on padding after the input -
     * confirm against the callers. */
#if HAVE_FAST_UNALIGNED
# if HAVE_FAST_64BIT
#   define RS 7
    for(i=0; i+1<length; i+=9){
        /* cheap test on 8 bytes at once: skip ahead if none of the bytes selected by the mask can be zero */
        if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
# else
#   define RS 3
    for(i=0; i+1<length; i+=5){
        /* same test on 4 bytes at once */
        if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
# endif
            continue;
        if(i>0 && !src[i]) i--;
        while(src[i]) i++; // advance to the next zero byte
#else
#   define RS 0
    for(i=0; i+1<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0) i--; // step back to the first zero of a pair
#endif
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            if(src[i+2]!=3){
                /* startcode, so we must be past the end */
                length=i;
            }
            break;
        }
        i-= RS;
    }

    if(i>=length-1){ //no escaped 0
        *dst_length= length;
        *consumed= length+1; //+1 for the header
        return src;
    }

    bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
    av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
    dst= h->rbsp_buffer[bufidx];

    if (dst == NULL){
        return NULL;
    }

//printf("decoding esc\n");
    /* everything before position i is known to be free of escapes */
    memcpy(dst, src, i);
    si=di=i;
    while(si+2<length){
        //remove escapes (very rare 1:2^22)
        if(src[si+2]>3){
            /* src[si+2] cannot end a 00 00 0x pattern, so two bytes can be copied at once */
            dst[di++]= src[si++];
            dst[di++]= src[si++];
        }else if(src[si]==0 && src[si+1]==0){
            if(src[si+2]==3){ //escape
                dst[di++]= 0;
                dst[di++]= 0;
                si+=3; // the 03 byte is dropped
                continue;
            }else //next start code
                goto nsc;
        }

        dst[di++]= src[si++];
    }
    /* copy the last (up to two) bytes */
    while(si<length)
        dst[di++]= src[si++];
nsc:

    /* zero the padding behind the unescaped data */
    memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);

    *dst_length= di;
    *consumed= si + 1;//+1 for the header
//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
    return dst;
}
1454
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456     int v= *src;
1457     int r;
1458
1459     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1460
1461     for(r=1; r<9; r++){
1462         if(v&1) return r;
1463         v>>=1;
1464     }
1465     return 0;
1466 }
1467
1468 /**
1469  * IDCT transforms the 16 dc values and dequantizes them.
1470  * @param qp quantization parameter
1471  */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474     int i;
1475     int temp[16]; //FIXME check if this is a good idea
1476     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1477     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1478
1479 //memset(block, 64, 2*256);
1480 //return;
1481     for(i=0; i<4; i++){
1482         const int offset= y_offset[i];
1483         const int z0= block[offset+stride*0] + block[offset+stride*4];
1484         const int z1= block[offset+stride*0] - block[offset+stride*4];
1485         const int z2= block[offset+stride*1] - block[offset+stride*5];
1486         const int z3= block[offset+stride*1] + block[offset+stride*5];
1487
1488         temp[4*i+0]= z0+z3;
1489         temp[4*i+1]= z1+z2;
1490         temp[4*i+2]= z1-z2;
1491         temp[4*i+3]= z0-z3;
1492     }
1493
1494     for(i=0; i<4; i++){
1495         const int offset= x_offset[i];
1496         const int z0= temp[4*0+i] + temp[4*2+i];
1497         const int z1= temp[4*0+i] - temp[4*2+i];
1498         const int z2= temp[4*1+i] - temp[4*3+i];
1499         const int z3= temp[4*1+i] + temp[4*3+i];
1500
1501         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1505     }
1506 }
1507
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code (inside #if 0). It uses the same offsets and butterfly
 * structure as h264_luma_dc_dequant_idct_c() but halves the results instead
 * of scaling them by a dequantization factor.
 * The qp parameter and the qmul lookup are commented out. FIXME
 * @param block coefficient array holding the dc values, modified in place
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: block[] -> temp[] */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: temp[] -> block[], results halved */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1547
1548 #undef xStride
1549 #undef stride
1550
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552     const int stride= 16*2;
1553     const int xStride= 16;
1554     int a,b,c,d,e;
1555
1556     a= block[stride*0 + xStride*0];
1557     b= block[stride*0 + xStride*1];
1558     c= block[stride*1 + xStride*0];
1559     d= block[stride*1 + xStride*1];
1560
1561     e= a-b;
1562     a= a+b;
1563     b= c-d;
1564     c= c+d;
1565
1566     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1570 }
1571
#if 0
/**
 * Transforms the 2x2 chroma dc values with sum/difference butterflies.
 * Disabled code (inside #if 0). Same data layout and butterflies as
 * chroma_dc_dequant_idct_c(), but without any scaling of the results.
 * @param block coefficient array holding the dc values, modified in place
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* a,b,c are reused for the sums/differences; e keeps the first difference */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1594
1595 /**
1596  * gets the chroma qp.
1597  */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599     return h->pps.chroma_qp_table[t][qscale];
1600 }
1601
/**
 * Motion compensates one partition from a single reference picture.
 *
 * The motion vector is taken from mv_cache[list] at block n. Luma is
 * predicted with qpix_op (a second call at +delta is made for non square
 * partitions), then both chroma planes with chroma_op. If the referenced
 * area reaches outside the picture by more than the allowed margin, the
 * source is first copied to s->edge_emu_buffer by ff_emulated_edge_mc().
 *
 * @param h             decoder context
 * @param pic           reference picture to predict from
 * @param n             block index (through scan8) of the motion vector to use
 * @param square        if zero, luma prediction is done in two calls
 * @param chroma_height height passed to chroma_op
 * @param delta         offset between the two luma calls (dest and source)
 * @param list          reference list the motion vector is read from
 * @param dest_y        luma destination
 * @param dest_cb       cb destination
 * @param dest_cr       cr destination
 * @param src_x_offset  horizontal position of the partition, multiplied by 8
 *                      before being added to the motion vector
 * @param src_y_offset  vertical position of the partition, scaled likewise
 * @param qpix_op       table of luma functions indexed by the low two bits
 *                      of both motion vector components
 * @param chroma_op     chroma function
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2); // index into qpix_op built from the fractional parts
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    /* a smaller margin is allowed when the vector has a fractional part */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    /* source area outside of picture plus margin: predict from an edge emulated copy */
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    /* no chroma prediction in gray mode */
    if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    /* the emulation buffer is shared, so cb is fully processed before cr is copied into it */
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1659
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662                            int x_offset, int y_offset,
1663                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665                            int list0, int list1){
1666     MpegEncContext * const s = &h->s;
1667     qpel_mc_func *qpix_op=  qpix_put;
1668     h264_chroma_mc_func chroma_op= chroma_put;
1669
1670     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1671     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1672     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1673     x_offset += 8*s->mb_x;
1674     y_offset += 8*(s->mb_y >> MB_FIELD);
1675
1676     if(list0){
1677         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681
1682         qpix_op=  qpix_avg;
1683         chroma_op= chroma_avg;
1684     }
1685
1686     if(list1){
1687         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690                            qpix_op, chroma_op);
1691     }
1692 }
1693
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700                            int list0, int list1){
1701     MpegEncContext * const s = &h->s;
1702
1703     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1704     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1705     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     x_offset += 8*s->mb_x;
1707     y_offset += 8*(s->mb_y >> MB_FIELD);
1708
1709     if(list0 && list1){
1710         /* don't optimize for luma-only case, since B-frames usually
1711          * use implicit weights => chroma too. */
1712         uint8_t *tmp_cb = s->obmc_scratchpad;
1713         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715         int refn0 = h->ref_cache[0][ scan8[n] ];
1716         int refn1 = h->ref_cache[1][ scan8[n] ];
1717
1718         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719                     dest_y, dest_cb, dest_cr,
1720                     x_offset, y_offset, qpix_put, chroma_put);
1721         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722                     tmp_y, tmp_cb, tmp_cr,
1723                     x_offset, y_offset, qpix_put, chroma_put);
1724
1725         if(h->use_weight == 2){
1726             int weight0 = h->implicit_weight[refn0][refn1];
1727             int weight1 = 64 - weight0;
1728             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1729             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731         }else{
1732             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1741         }
1742     }else{
1743         int list = list1 ? 1 : 0;
1744         int refn = h->ref_cache[list][ scan8[n] ];
1745         Picture *ref= &h->ref_list[list][refn];
1746         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748                     qpix_put, chroma_put);
1749
1750         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752         if(h->use_weight_chroma){
1753             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1757         }
1758     }
1759 }
1760
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763                            int x_offset, int y_offset,
1764                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767                            int list0, int list1){
1768     if((h->use_weight==2 && list0 && list1
1769         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770        || h->use_weight==1)
1771         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772                          x_offset, y_offset, qpix_put, chroma_put,
1773                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774     else
1775         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1777 }
1778
1779 static inline void prefetch_motion(H264Context *h, int list){
1780     /* fetch pixels for estimated mv 4 macroblocks ahead
1781      * optimized for 64byte cache lines */
1782     MpegEncContext * const s = &h->s;
1783     const int refn = h->ref_cache[list][scan8[0]];
1784     if(refn >= 0){
1785         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787         uint8_t **src= h->ref_list[list][refn].data;
1788         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1792     }
1793 }
1794
/**
 * Performs the inter prediction of the current macroblock.
 *
 * The macroblock type is read from current_picture.mb_type[h->mb_xy] and
 * one mc_part() call is made per partition (16x16, 16x8, 8x16, or for 8x8
 * per sub-partition 8x8, 8x4, 4x8, 4x4). The list 0 reference is prefetched
 * before and the list 1 reference after the prediction.
 *
 * The arguments following h in each mc_part() call are: block index n,
 * square flag, chroma height, delta between the two halves of a non square
 * partition, the destinations, and the x/y offset of the partition inside
 * the macroblock.
 *
 * @param dest_y     luma destination of the macroblock
 * @param dest_cb    cb destination of the macroblock
 * @param dest_cr    cr destination of the macroblock
 * @param qpix_put   luma put function tables, indexed by partition size class
 * @param chroma_put chroma put functions, indexed by partition size class
 * @param qpix_avg   luma avg function tables, same indexing as qpix_put
 * @param chroma_avg chroma avg functions, same indexing as chroma_put
 * @param weight_op  weight functions; entry i is used for partition shape i
 *                   (0: 16x16, 1: 16x8, 2: 8x16, 3: 8x8, 4: 8x4, 5: 4x8, 6: 4x4)
 * @param weight_avg biweight functions, same indexing as weight_op
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* upper and lower half; each is done as two 8 wide luma calls (delta 8) */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* left and right half; each is done as two luma calls 8 lines apart */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 blocks, each with its own sub macroblock type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1881
1882 static av_cold void init_cavlc_level_tab(void){
1883     int suffix_length, mask;
1884     unsigned int i;
1885
1886     for(suffix_length=0; suffix_length<7; suffix_length++){
1887         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1890
1891             mask= -(level_code&1);
1892             level_code= (((2+level_code)>>1) ^ mask) - mask;
1893             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894                 cavlc_level_tab[suffix_length][i][0]= level_code;
1895                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899             }else{
1900                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1902             }
1903         }
1904     }
1905 }
1906
1907 static av_cold void decode_init_vlc(void){
1908     static int done = 0;
1909
1910     if (!done) {
1911         int i;
1912         int offset;
1913         done = 1;
1914
1915         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918                  &chroma_dc_coeff_token_len [0], 1, 1,
1919                  &chroma_dc_coeff_token_bits[0], 1, 1,
1920                  INIT_VLC_USE_NEW_STATIC);
1921
1922         offset = 0;
1923         for(i=0; i<4; i++){
1924             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927                      &coeff_token_len [i][0], 1, 1,
1928                      &coeff_token_bits[i][0], 1, 1,
1929                      INIT_VLC_USE_NEW_STATIC);
1930             offset += coeff_token_vlc_tables_size[i];
1931         }
1932         /*
1933          * This is a one time safety check to make sure that
1934          * the packed static coeff_token_vlc table sizes
1935          * were initialized correctly.
1936          */
1937         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1938
1939         for(i=0; i<3; i++){
1940             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942             init_vlc(&chroma_dc_total_zeros_vlc[i],
1943                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1945                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946                      INIT_VLC_USE_NEW_STATIC);
1947         }
1948         for(i=0; i<15; i++){
1949             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951             init_vlc(&total_zeros_vlc[i],
1952                      TOTAL_ZEROS_VLC_BITS, 16,
1953                      &total_zeros_len [i][0], 1, 1,
1954                      &total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957
1958         for(i=0; i<6; i++){
1959             run_vlc[i].table = run_vlc_tables[i];
1960             run_vlc[i].table_allocated = run_vlc_tables_size;
1961             init_vlc(&run_vlc[i],
1962                      RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1,
1965                      INIT_VLC_USE_NEW_STATIC);
1966         }
1967         run7_vlc.table = run7_vlc_table,
1968         run7_vlc.table_allocated = run7_vlc_table_size;
1969         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970                  &run_len [6][0], 1, 1,
1971                  &run_bits[6][0], 1, 1,
1972                  INIT_VLC_USE_NEW_STATIC);
1973
1974         init_cavlc_level_tab();
1975     }
1976 }
1977
1978 static void free_tables(H264Context *h){
1979     int i;
1980     H264Context *hx;
1981     av_freep(&h->intra4x4_pred_mode);
1982     av_freep(&h->chroma_pred_mode_table);
1983     av_freep(&h->cbp_table);
1984     av_freep(&h->mvd_table[0]);
1985     av_freep(&h->mvd_table[1]);
1986     av_freep(&h->direct_table);
1987     av_freep(&h->non_zero_count);
1988     av_freep(&h->slice_table_base);
1989     h->slice_table= NULL;
1990
1991     av_freep(&h->mb2b_xy);
1992     av_freep(&h->mb2b8_xy);
1993
1994     for(i = 0; i < MAX_THREADS; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000     }
2001 }
2002
2003 static void init_dequant8_coeff_table(H264Context *h){
2004     int i,q,x;
2005     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2006     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2007     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2008
2009     for(i=0; i<2; i++ ){
2010         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2011             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2012             break;
2013         }
2014
2015         for(q=0; q<52; q++){
2016             int shift = div6[q];
2017             int idx = rem6[q];
2018             for(x=0; x<64; x++)
2019                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2020                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2021                     h->pps.scaling_matrix8[i][x]) << shift;
2022         }
2023     }
2024 }
2025
2026 static void init_dequant4_coeff_table(H264Context *h){
2027     int i,j,q,x;
2028     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2029     for(i=0; i<6; i++ ){
2030         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2031         for(j=0; j<i; j++){
2032             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2033                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2034                 break;
2035             }
2036         }
2037         if(j<i)
2038             continue;
2039
2040         for(q=0; q<52; q++){
2041             int shift = div6[q] + 2;
2042             int idx = rem6[q];
2043             for(x=0; x<16; x++)
2044                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2045                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2046                     h->pps.scaling_matrix4[i][x]) << shift;
2047         }
2048     }
2049 }
2050
2051 static void init_dequant_tables(H264Context *h){
2052     int i,x;
2053     init_dequant4_coeff_table(h);
2054     if(h->pps.transform_8x8_mode)
2055         init_dequant8_coeff_table(h);
2056     if(h->sps.transform_bypass){
2057         for(i=0; i<6; i++)
2058             for(x=0; x<16; x++)
2059                 h->dequant4_coeff[i][0][x] = 1<<6;
2060         if(h->pps.transform_8x8_mode)
2061             for(i=0; i<2; i++)
2062                 for(x=0; x<64; x++)
2063                     h->dequant8_coeff[i][0][x] = 1<<6;
2064     }
2065 }
2066
2067
2068 /**
2069  * allocates tables.
2070  * needs width/height
2071  */
2072 static int alloc_tables(H264Context *h){
2073     MpegEncContext * const s = &h->s;
2074     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2075     int x,y;
2076
2077     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2078
2079     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2080     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2081     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2082
2083     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2084     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2085     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2086     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2087
2088     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2089     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2090
2091     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2092     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2093     for(y=0; y<s->mb_height; y++){
2094         for(x=0; x<s->mb_width; x++){
2095             const int mb_xy= x + y*s->mb_stride;
2096             const int b_xy = 4*x + 4*y*h->b_stride;
2097             const int b8_xy= 2*x + 2*y*h->b8_stride;
2098
2099             h->mb2b_xy [mb_xy]= b_xy;
2100             h->mb2b8_xy[mb_xy]= b8_xy;
2101         }
2102     }
2103
2104     s->obmc_scratchpad = NULL;
2105
2106     if(!h->dequant4_coeff[0])
2107         init_dequant_tables(h);
2108
2109     return 0;
2110 fail:
2111     free_tables(h);
2112     return -1;
2113 }
2114
2115 /**
2116  * Mimic alloc_tables(), but for every context thread.
2117  */
2118 static void clone_tables(H264Context *dst, H264Context *src){
2119     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2120     dst->non_zero_count           = src->non_zero_count;
2121     dst->slice_table              = src->slice_table;
2122     dst->cbp_table                = src->cbp_table;
2123     dst->mb2b_xy                  = src->mb2b_xy;
2124     dst->mb2b8_xy                 = src->mb2b8_xy;
2125     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2126     dst->mvd_table[0]             = src->mvd_table[0];
2127     dst->mvd_table[1]             = src->mvd_table[1];
2128     dst->direct_table             = src->direct_table;
2129
2130     dst->s.obmc_scratchpad = NULL;
2131     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2132 }
2133
2134 /**
2135  * Init context
2136  * Allocate buffers which are not shared amongst multiple threads.
2137  */
2138 static int context_init(H264Context *h){
2139     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2140     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2141
2142     return 0;
2143 fail:
2144     return -1; // free_tables will clean up for us
2145 }
2146
2147 static av_cold void common_init(H264Context *h){
2148     MpegEncContext * const s = &h->s;
2149
2150     s->width = s->avctx->width;
2151     s->height = s->avctx->height;
2152     s->codec_id= s->avctx->codec->id;
2153
2154     ff_h264_pred_init(&h->hpc, s->codec_id);
2155
2156     h->dequant_coeff_pps= -1;
2157     s->unrestricted_mv=1;
2158     s->decode=1; //FIXME
2159
2160     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2161
2162     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2163     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2164 }
2165
2166 /**
2167  * Reset SEI values at the beginning of the frame.
2168  *
2169  * @param h H.264 context.
2170  */
2171 static void reset_sei(H264Context *h) {
2172     h->sei_recovery_frame_cnt       = -1;
2173     h->sei_dpb_output_delay         =  0;
2174     h->sei_cpb_removal_delay        = -1;
2175     h->sei_buffering_period_present =  0;
2176 }
2177
2178 static av_cold int decode_init(AVCodecContext *avctx){
2179     H264Context *h= avctx->priv_data;
2180     MpegEncContext * const s = &h->s;
2181
2182     MPV_decode_defaults(s);
2183
2184     s->avctx = avctx;
2185     common_init(h);
2186
2187     s->out_format = FMT_H264;
2188     s->workaround_bugs= avctx->workaround_bugs;
2189
2190     // set defaults
2191 //    s->decode_mb= ff_h263_decode_mb;
2192     s->quarter_sample = 1;
2193     if(!avctx->has_b_frames)
2194     s->low_delay= 1;
2195
2196     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2197         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2198     else
2199         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2200     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2201     avctx->chroma_sample_location = AVCHROMA_LOC_LEFT;
2202
2203     decode_init_vlc();
2204
2205     if(avctx->extradata_size > 0 && avctx->extradata &&
2206        *(char *)avctx->extradata == 1){
2207         h->is_avc = 1;
2208         h->got_avcC = 0;
2209     } else {
2210         h->is_avc = 0;
2211     }
2212
2213     h->thread_context[0] = h;
2214     h->outputed_poc = INT_MIN;
2215     h->prev_poc_msb= 1<<16;
2216     reset_sei(h);
2217     if(avctx->codec_id == CODEC_ID_H264){
2218         if(avctx->ticks_per_frame == 1){
2219             s->avctx->time_base.den *=2;
2220         }
2221         avctx->ticks_per_frame = 2;
2222     }
2223     return 0;
2224 }
2225
2226 static int frame_start(H264Context *h){
2227     MpegEncContext * const s = &h->s;
2228     int i;
2229
2230     if(MPV_frame_start(s, s->avctx) < 0)
2231         return -1;
2232     ff_er_frame_start(s);
2233     /*
2234      * MPV_frame_start uses pict_type to derive key_frame.
2235      * This is incorrect for H.264; IDR markings must be used.
2236      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2237      * See decode_nal_units().
2238      */
2239     s->current_picture_ptr->key_frame= 0;
2240
2241     assert(s->linesize && s->uvlinesize);
2242
2243     for(i=0; i<16; i++){
2244         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2245         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2246     }
2247     for(i=0; i<4; i++){
2248         h->block_offset[16+i]=
2249         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2250         h->block_offset[24+16+i]=
2251         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2252     }
2253
2254     /* can't be in alloc_tables because linesize isn't known there.
2255      * FIXME: redo bipred weight to not require extra buffer? */
2256     for(i = 0; i < s->avctx->thread_count; i++)
2257         if(!h->thread_context[i]->s.obmc_scratchpad)
2258             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2259
2260     /* some macroblocks will be accessed before they're available */
2261     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2262         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2263
2264 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2265
2266     // We mark the current picture as non-reference after allocating it, so
2267     // that if we break out due to an error it can be released automatically
2268     // in the next MPV_frame_start().
2269     // SVQ3 as well as most other codecs have only last/next/current and thus
2270     // get released even with set reference, besides SVQ3 and others do not
2271     // mark frames as reference later "naturally".
2272     if(s->codec_id != CODEC_ID_SVQ3)
2273         s->current_picture_ptr->reference= 0;
2274
2275     s->current_picture_ptr->field_poc[0]=
2276     s->current_picture_ptr->field_poc[1]= INT_MAX;
2277     assert(s->current_picture_ptr->long_ref==0);
2278
2279     return 0;
2280 }
2281
2282 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2283     MpegEncContext * const s = &h->s;
2284     int i;
2285     int step    = 1;
2286     int offset  = 1;
2287     int uvoffset= 1;
2288     int top_idx = 1;
2289     int skiplast= 0;
2290
2291     src_y  -=   linesize;
2292     src_cb -= uvlinesize;
2293     src_cr -= uvlinesize;
2294
2295     if(!simple && FRAME_MBAFF){
2296         if(s->mb_y&1){
2297             offset  = MB_MBAFF ? 1 : 17;
2298             uvoffset= MB_MBAFF ? 1 : 9;
2299             if(!MB_MBAFF){
2300                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2301                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2302                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2303                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2304                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2305                 }
2306             }
2307         }else{
2308             if(!MB_MBAFF){
2309                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2310                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2311                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2312                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2313                 }
2314                 skiplast= 1;
2315             }
2316             offset  =
2317             uvoffset=
2318             top_idx = MB_MBAFF ? 0 : 1;
2319         }
2320         step= MB_MBAFF ? 2 : 1;
2321     }
2322
2323     // There are two lines saved, the line above the the top macroblock of a pair,
2324     // and the line above the bottom macroblock
2325     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2326     for(i=1; i<17 - skiplast; i++){
2327         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2328     }
2329
2330     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2331     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2332
2333     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2334         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2335         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2336         for(i=1; i<9 - skiplast; i++){
2337             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2338             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2339         }
2340         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2341         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2342     }
2343 }
2344
2345 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2346     MpegEncContext * const s = &h->s;
2347     int temp8, i;
2348     uint64_t temp64;
2349     int deblock_left;
2350     int deblock_top;
2351     int mb_xy;
2352     int step    = 1;
2353     int offset  = 1;
2354     int uvoffset= 1;
2355     int top_idx = 1;
2356
2357     if(!simple && FRAME_MBAFF){
2358         if(s->mb_y&1){
2359             offset  = MB_MBAFF ? 1 : 17;
2360             uvoffset= MB_MBAFF ? 1 : 9;
2361         }else{
2362             offset  =
2363             uvoffset=
2364             top_idx = MB_MBAFF ? 0 : 1;
2365         }
2366         step= MB_MBAFF ? 2 : 1;
2367     }
2368
2369     if(h->deblocking_filter == 2) {
2370         mb_xy = h->mb_xy;
2371         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2372         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2373     } else {
2374         deblock_left = (s->mb_x > 0);
2375         deblock_top =  (s->mb_y > !!MB_FIELD);
2376     }
2377
2378     src_y  -=   linesize + 1;
2379     src_cb -= uvlinesize + 1;
2380     src_cr -= uvlinesize + 1;
2381
2382 #define XCHG(a,b,t,xchg)\
2383 t= a;\
2384 if(xchg)\
2385     a= b;\
2386 b= t;
2387
2388     if(deblock_left){
2389         for(i = !deblock_top; i<16; i++){
2390             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2391         }
2392         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2393     }
2394
2395     if(deblock_top){
2396         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2397         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2398         if(s->mb_x+1 < s->mb_width){
2399             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2400         }
2401     }
2402
2403     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2404         if(deblock_left){
2405             for(i = !deblock_top; i<8; i++){
2406                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2407                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2408             }
2409             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2410             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2411         }
2412         if(deblock_top){
2413             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2414             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2415         }
2416     }
2417 }
2418
2419 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2420     MpegEncContext * const s = &h->s;
2421     const int mb_x= s->mb_x;
2422     const int mb_y= s->mb_y;
2423     const int mb_xy= h->mb_xy;
2424     const int mb_type= s->current_picture.mb_type[mb_xy];
2425     uint8_t  *dest_y, *dest_cb, *dest_cr;
2426     int linesize, uvlinesize /*dct_offset*/;
2427     int i;
2428     int *block_offset = &h->block_offset[0];
2429     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2430     /* is_h264 should always be true if SVQ3 is disabled. */
2431     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2432     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2433     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2434
2435     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2436     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2437     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2438
2439     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2440     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2441
2442     if (!simple && MB_FIELD) {
2443         linesize   = h->mb_linesize   = s->linesize * 2;
2444         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2445         block_offset = &h->block_offset[24];
2446         if(mb_y&1){ //FIXME move out of this function?
2447             dest_y -= s->linesize*15;
2448             dest_cb-= s->uvlinesize*7;
2449             dest_cr-= s->uvlinesize*7;
2450         }
2451         if(FRAME_MBAFF) {
2452             int list;
2453             for(list=0; list<h->list_count; list++){
2454                 if(!USES_LIST(mb_type, list))
2455                     continue;
2456                 if(IS_16X16(mb_type)){
2457                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2458                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2459                 }else{
2460                     for(i=0; i<16; i+=4){
2461                         int ref = h->ref_cache[list][scan8[i]];
2462                         if(ref >= 0)
2463                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2464                     }
2465                 }
2466             }
2467         }
2468     } else {
2469         linesize   = h->mb_linesize   = s->linesize;
2470         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2471 //        dct_offset = s->linesize * 16;
2472     }
2473
2474     if (!simple && IS_INTRA_PCM(mb_type)) {
2475         for (i=0; i<16; i++) {
2476             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2477         }
2478         for (i=0; i<8; i++) {
2479             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2480             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2481         }
2482     } else {
2483         if(IS_INTRA(mb_type)){
2484             if(h->deblocking_filter)
2485                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2486
2487             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2488                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2489                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2490             }
2491
2492             if(IS_INTRA4x4(mb_type)){
2493                 if(simple || !s->encoding){
2494                     if(IS_8x8DCT(mb_type)){
2495                         if(transform_bypass){
2496                             idct_dc_add =
2497                             idct_add    = s->dsp.add_pixels8;
2498                         }else{
2499                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2500                             idct_add    = s->dsp.h264_idct8_add;
2501                         }
2502                         for(i=0; i<16; i+=4){
2503                             uint8_t * const ptr= dest_y + block_offset[i];
2504                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2505                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2506                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2507                             }else{
2508                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2509                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2510                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2511                                 if(nnz){
2512                                     if(nnz == 1 && h->mb[i*16])
2513                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2514                                     else
2515                                         idct_add   (ptr, h->mb + i*16, linesize);
2516                                 }
2517                             }
2518                         }
2519                     }else{
2520                         if(transform_bypass){
2521                             idct_dc_add =
2522                             idct_add    = s->dsp.add_pixels4;
2523                         }else{
2524                             idct_dc_add = s->dsp.h264_idct_dc_add;
2525                             idct_add    = s->dsp.h264_idct_add;
2526                         }
2527                         for(i=0; i<16; i++){
2528                             uint8_t * const ptr= dest_y + block_offset[i];
2529                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2530
2531                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2532                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2533                             }else{
2534                                 uint8_t *topright;
2535                                 int nnz, tr;
2536                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2537                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2538                                     assert(mb_y || linesize <= block_offset[i]);
2539                                     if(!topright_avail){
2540                                         tr= ptr[3 - linesize]*0x01010101;
2541                                         topright= (uint8_t*) &tr;
2542                                     }else
2543                                         topright= ptr + 4 - linesize;
2544                                 }else
2545                                     topright= NULL;
2546
2547                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2548                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2549                                 if(nnz){
2550                                     if(is_h264){
2551                                         if(nnz == 1 && h->mb[i*16])
2552                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2553                                         else
2554                                             idct_add   (ptr, h->mb + i*16, linesize);
2555                                     }else
2556                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2557                                 }
2558                             }
2559                         }
2560                     }
2561                 }
2562             }else{
2563                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2564                 if(is_h264){
2565                     if(!transform_bypass)
2566                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2567                 }else
2568                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2569             }
2570             if(h->deblocking_filter)
2571                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2572         }else if(is_h264){
2573             hl_motion(h, dest_y, dest_cb, dest_cr,
2574                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2575                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2576                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2577         }
2578
2579
2580         if(!IS_INTRA4x4(mb_type)){
2581             if(is_h264){
2582                 if(IS_INTRA16x16(mb_type)){
2583                     if(transform_bypass){
2584                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2585                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2586                         }else{
2587                             for(i=0; i<16; i++){
2588                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2589                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2590                             }
2591                         }
2592                     }else{
2593                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2594                     }
2595                 }else if(h->cbp&15){
2596                     if(transform_bypass){
2597                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2598                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2599                         for(i=0; i<16; i+=di){
2600                             if(h->non_zero_count_cache[ scan8[i] ]){
2601                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2602                             }
2603                         }
2604                     }else{
2605                         if(IS_8x8DCT(mb_type)){
2606                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2607                         }else{
2608                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2609                         }
2610                     }
2611                 }
2612             }else{
2613                 for(i=0; i<16; i++){
2614                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2615                         uint8_t * const ptr= dest_y + block_offset[i];
2616                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2617                     }
2618                 }
2619             }
2620         }
2621
2622         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2623             uint8_t *dest[2] = {dest_cb, dest_cr};
2624             if(transform_bypass){
2625                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2626                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2627                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2628                 }else{
2629                     idct_add = s->dsp.add_pixels4;
2630                     for(i=16; i<16+8; i++){
2631                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2632                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2633                     }
2634                 }
2635             }else{
2636                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2637                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2638                 if(is_h264){
2639                     idct_add = s->dsp.h264_idct_add;
2640                     idct_dc_add = s->dsp.h264_idct_dc_add;
2641                     for(i=16; i<16+8; i++){
2642                         if(h->non_zero_count_cache[ scan8[i] ])
2643                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2644                         else if(h->mb[i*16])
2645                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2646                     }
2647                 }else{
2648                     for(i=16; i<16+8; i++){
2649                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2650                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2651                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2652                         }
2653                     }
2654                 }
2655             }
2656         }
2657     }
2658     if(h->cbp || IS_INTRA(mb_type))
2659         s->dsp.clear_blocks(h->mb);
2660
2661     if(h->deblocking_filter) {
2662         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2663         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2664         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2665         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2666         if (!simple && FRAME_MBAFF) {
2667             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2668         } else {
2669             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2670         }
2671     }
2672 }
2673
2674 /**
2675  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2676  */
2677 static void hl_decode_mb_simple(H264Context *h){
2678     hl_decode_mb_internal(h, 1);
2679 }
2680
2681 /**
2682  * Process a macroblock; this handles edge cases, such as interlacing.
2683  */
2684 static void av_noinline hl_decode_mb_complex(H264Context *h){
2685     hl_decode_mb_internal(h, 0);
2686 }
2687
2688 static void hl_decode_mb(H264Context *h){
2689     MpegEncContext * const s = &h->s;
2690     const int mb_xy= h->mb_xy;
2691     const int mb_type= s->current_picture.mb_type[mb_xy];
2692     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2693
2694     if (is_complex)
2695         hl_decode_mb_complex(h);
2696     else hl_decode_mb_simple(h);
2697 }
2698
2699 static void pic_as_field(Picture *pic, const int parity){
2700     int i;
2701     for (i = 0; i < 4; ++i) {
2702         if (parity == PICT_BOTTOM_FIELD)
2703             pic->data[i] += pic->linesize[i];
2704         pic->reference = parity;
2705         pic->linesize[i] *= 2;
2706     }
2707     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2708 }
2709
2710 static int split_field_copy(Picture *dest, Picture *src,
2711                             int parity, int id_add){
2712     int match = !!(src->reference & parity);
2713
2714     if (match) {
2715         *dest = *src;
2716         if(parity != PICT_FRAME){
2717             pic_as_field(dest, parity);
2718             dest->pic_id *= 2;
2719             dest->pic_id += id_add;
2720         }
2721     }
2722
2723     return match;
2724 }
2725
2726 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2727     int i[2]={0};
2728     int index=0;
2729
2730     while(i[0]<len || i[1]<len){
2731         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2732             i[0]++;
2733         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2734             i[1]++;
2735         if(i[0] < len){
2736             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2737             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2738         }
2739         if(i[1] < len){
2740             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2741             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2742         }
2743     }
2744
2745     return index;
2746 }
2747
2748 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2749     int i, best_poc;
2750     int out_i= 0;
2751
2752     for(;;){
2753         best_poc= dir ? INT_MIN : INT_MAX;
2754
2755         for(i=0; i<len; i++){
2756             const int poc= src[i]->poc;
2757             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2758                 best_poc= poc;
2759                 sorted[out_i]= src[i];
2760             }
2761         }
2762         if(best_poc == (dir ? INT_MIN : INT_MAX))
2763             break;
2764         limit= sorted[out_i++]->poc - dir;
2765     }
2766     return out_i;
2767 }
2768
2769 /**
2770  * fills the default_ref_list.
2771  */
2772 static int fill_default_ref_list(H264Context *h){
2773     MpegEncContext * const s = &h->s;
2774     int i, len;
2775
2776     if(h->slice_type_nos==FF_B_TYPE){
2777         Picture *sorted[32];
2778         int cur_poc, list;
2779         int lens[2];
2780
2781         if(FIELD_PICTURE)
2782             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2783         else
2784             cur_poc= s->current_picture_ptr->poc;
2785
2786         for(list= 0; list<2; list++){
2787             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2788             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2789             assert(len<=32);
2790             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2791             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2792             assert(len<=32);
2793
2794             if(len < h->ref_count[list])
2795                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2796             lens[list]= len;
2797         }
2798
2799         if(lens[0] == lens[1] && lens[1] > 1){
2800             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2801             if(i == lens[0])
2802                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2803         }
2804     }else{
2805         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2806         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2807         assert(len <= 32);
2808         if(len < h->ref_count[0])
2809             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2810     }
2811 #ifdef TRACE
2812     for (i=0; i<h->ref_count[0]; i++) {
2813         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2814     }
2815     if(h->slice_type_nos==FF_B_TYPE){
2816         for (i=0; i<h->ref_count[1]; i++) {
2817             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2818         }
2819     }
2820 #endif
2821     return 0;
2822 }
2823
2824 static void print_short_term(H264Context *h);
2825 static void print_long_term(H264Context *h);
2826
2827 /**
2828  * Extract structure information about the picture described by pic_num in
2829  * the current decoding context (frame or field). Note that pic_num is
2830  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2831  * @param pic_num picture number for which to extract structure information
2832  * @param structure one of PICT_XXX describing structure of picture
2833  *                      with pic_num
2834  * @return frame number (short term) or long term index of picture
2835  *         described by pic_num
2836  */
2837 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2838     MpegEncContext * const s = &h->s;
2839
2840     *structure = s->picture_structure;
2841     if(FIELD_PICTURE){
2842         if (!(pic_num & 1))
2843             /* opposite field */
2844             *structure ^= PICT_FRAME;
2845         pic_num >>= 1;
2846     }
2847
2848     return pic_num;
2849 }
2850
2851 static int decode_ref_pic_list_reordering(H264Context *h){
2852     MpegEncContext * const s = &h->s;
2853     int list, index, pic_structure;
2854
2855     print_short_term(h);
2856     print_long_term(h);
2857
2858     for(list=0; list<h->list_count; list++){
2859         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2860
2861         if(get_bits1(&s->gb)){
2862             int pred= h->curr_pic_num;
2863
2864             for(index=0; ; index++){
2865                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2866                 unsigned int pic_id;
2867                 int i;
2868                 Picture *ref = NULL;
2869
2870                 if(reordering_of_pic_nums_idc==3)
2871                     break;
2872
2873                 if(index >= h->ref_count[list]){
2874                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2875                     return -1;
2876                 }
2877
2878                 if(reordering_of_pic_nums_idc<3){
2879                     if(reordering_of_pic_nums_idc<2){
2880                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2881                         int frame_num;
2882
2883                         if(abs_diff_pic_num > h->max_pic_num){
2884                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2885                             return -1;
2886                         }
2887
2888                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2889                         else                                pred+= abs_diff_pic_num;
2890                         pred &= h->max_pic_num - 1;
2891
2892                         frame_num = pic_num_extract(h, pred, &pic_structure);
2893
2894                         for(i= h->short_ref_count-1; i>=0; i--){
2895                             ref = h->short_ref[i];
2896                             assert(ref->reference);
2897                             assert(!ref->long_ref);
2898                             if(
2899                                    ref->frame_num == frame_num &&
2900                                    (ref->reference & pic_structure)
2901                               )
2902                                 break;
2903                         }
2904                         if(i>=0)
2905                             ref->pic_id= pred;
2906                     }else{
2907                         int long_idx;
2908                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2909
2910                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2911
2912                         if(long_idx>31){
2913                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2914                             return -1;
2915                         }
2916                         ref = h->long_ref[long_idx];
2917                         assert(!(ref && !ref->reference));
2918                         if(ref && (ref->reference & pic_structure)){
2919                             ref->pic_id= pic_id;
2920                             assert(ref->long_ref);
2921                             i=0;
2922                         }else{
2923                             i=-1;
2924                         }
2925                     }
2926
2927                     if (i < 0) {
2928                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2929                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2930                     } else {
2931                         for(i=index; i+1<h->ref_count[list]; i++){
2932                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2933                                 break;
2934                         }
2935                         for(; i > index; i--){
2936                             h->ref_list[list][i]= h->ref_list[list][i-1];
2937                         }
2938                         h->ref_list[list][index]= *ref;
2939                         if (FIELD_PICTURE){
2940                             pic_as_field(&h->ref_list[list][index], pic_structure);
2941                         }
2942                     }
2943                 }else{
2944                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2945                     return -1;
2946                 }
2947             }
2948         }
2949     }
2950     for(list=0; list<h->list_count; list++){
2951         for(index= 0; index < h->ref_count[list]; index++){
2952             if(!h->ref_list[list][index].data[0]){
2953                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2954                 if(h->default_ref_list[list][0].data[0])
2955                     h->ref_list[list][index]= h->default_ref_list[list][0];
2956                 else
2957                     return -1;
2958             }
2959         }
2960     }
2961
2962     return 0;
2963 }
2964
2965 static void fill_mbaff_ref_list(H264Context *h){
2966     int list, i, j;
2967     for(list=0; list<2; list++){ //FIXME try list_count
2968         for(i=0; i<h->ref_count[list]; i++){
2969             Picture *frame = &h->ref_list[list][i];
2970             Picture *field = &h->ref_list[list][16+2*i];
2971             field[0] = *frame;
2972             for(j=0; j<3; j++)
2973                 field[0].linesize[j] <<= 1;
2974             field[0].reference = PICT_TOP_FIELD;
2975             field[0].poc= field[0].field_poc[0];
2976             field[1] = field[0];
2977             for(j=0; j<3; j++)
2978                 field[1].data[j] += frame->linesize[j];
2979             field[1].reference = PICT_BOTTOM_FIELD;
2980             field[1].poc= field[1].field_poc[1];
2981
2982             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2983             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2984             for(j=0; j<2; j++){
2985                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2986                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2987             }
2988         }
2989     }
2990     for(j=0; j<h->ref_count[1]; j++){
2991         for(i=0; i<h->ref_count[0]; i++)
2992             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2993         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2994         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2995     }
2996 }
2997
2998 static int pred_weight_table(H264Context *h){
2999     MpegEncContext * const s = &h->s;
3000     int list, i;
3001     int luma_def, chroma_def;
3002
3003     h->use_weight= 0;
3004     h->use_weight_chroma= 0;
3005     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3006     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3007     luma_def = 1<<h->luma_log2_weight_denom;
3008     chroma_def = 1<<h->chroma_log2_weight_denom;
3009
3010     for(list=0; list<2; list++){
3011         h->luma_weight_flag[list]   = 0;
3012         h->chroma_weight_flag[list] = 0;
3013         for(i=0; i<h->ref_count[list]; i++){
3014             int luma_weight_flag, chroma_weight_flag;
3015
3016             luma_weight_flag= get_bits1(&s->gb);
3017             if(luma_weight_flag){
3018                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3019                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3020                 if(   h->luma_weight[list][i] != luma_def
3021                    || h->luma_offset[list][i] != 0) {
3022                     h->use_weight= 1;
3023                     h->luma_weight_flag[list]= 1;
3024                 }
3025             }else{
3026                 h->luma_weight[list][i]= luma_def;
3027                 h->luma_offset[list][i]= 0;
3028             }
3029
3030             if(CHROMA){
3031                 chroma_weight_flag= get_bits1(&s->gb);
3032                 if(chroma_weight_flag){
3033                     int j;
3034                     for(j=0; j<2; j++){
3035                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3036                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3037                         if(   h->chroma_weight[list][i][j] != chroma_def
3038                            || h->chroma_offset[list][i][j] != 0) {
3039                             h->use_weight_chroma= 1;
3040                             h->chroma_weight_flag[list]= 1;
3041                         }
3042                     }
3043                 }else{
3044                     int j;
3045                     for(j=0; j<2; j++){
3046                         h->chroma_weight[list][i][j]= chroma_def;
3047                         h->chroma_offset[list][i][j]= 0;
3048                     }
3049                 }
3050             }
3051         }
3052         if(h->slice_type_nos != FF_B_TYPE) break;
3053     }
3054     h->use_weight= h->use_weight || h->use_weight_chroma;
3055     return 0;
3056 }
3057
3058 static void implicit_weight_table(H264Context *h){
3059     MpegEncContext * const s = &h->s;
3060     int ref0, ref1, i;
3061     int cur_poc = s->current_picture_ptr->poc;
3062
3063     for (i = 0; i < 2; i++) {
3064         h->luma_weight_flag[i]   = 0;
3065         h->chroma_weight_flag[i] = 0;
3066     }
3067
3068     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3069        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3070         h->use_weight= 0;
3071         h->use_weight_chroma= 0;
3072         return;
3073     }
3074
3075     h->use_weight= 2;
3076     h->use_weight_chroma= 2;
3077     h->luma_log2_weight_denom= 5;
3078     h->chroma_log2_weight_denom= 5;
3079
3080     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3081         int poc0 = h->ref_list[0][ref0].poc;
3082         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3083             int poc1 = h->ref_list[1][ref1].poc;
3084             int td = av_clip(poc1 - poc0, -128, 127);
3085             if(td){
3086                 int tb = av_clip(cur_poc - poc0, -128, 127);
3087                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3088                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3089                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3090                     h->implicit_weight[ref0][ref1] = 32;
3091                 else
3092                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3093             }else
3094                 h->implicit_weight[ref0][ref1] = 32;
3095         }
3096     }
3097 }
3098
3099 /**
3100  * Mark a picture as no longer needed for reference. The refmask
3101  * argument allows unreferencing of individual fields or the whole frame.
3102  * If the picture becomes entirely unreferenced, but is being held for
3103  * display purposes, it is marked as such.
3104  * @param refmask mask of fields to unreference; the mask is bitwise
3105  *                anded with the reference marking of pic
3106  * @return non-zero if pic becomes entirely unreferenced (except possibly
3107  *         for display purposes) zero if one of the fields remains in
3108  *         reference
3109  */
3110 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3111     int i;
3112     if (pic->reference &= refmask) {
3113         return 0;
3114     } else {
3115         for(i = 0; h->delayed_pic[i]; i++)
3116             if(pic == h->delayed_pic[i]){
3117                 pic->reference=DELAYED_PIC_REF;
3118                 break;
3119             }
3120         return 1;
3121     }
3122 }
3123
3124 /**
3125  * instantaneous decoder refresh.
3126  */
3127 static void idr(H264Context *h){
3128     int i;
3129
3130     for(i=0; i<16; i++){
3131         remove_long(h, i, 0);
3132     }
3133     assert(h->long_ref_count==0);
3134
3135     for(i=0; i<h->short_ref_count; i++){
3136         unreference_pic(h, h->short_ref[i], 0);
3137         h->short_ref[i]= NULL;
3138     }
3139     h->short_ref_count=0;
3140     h->prev_frame_num= 0;
3141     h->prev_frame_num_offset= 0;
3142     h->prev_poc_msb=
3143     h->prev_poc_lsb= 0;
3144 }
3145
3146 /* forget old pics after a seek */
3147 static void flush_dpb(AVCodecContext *avctx){
3148     H264Context *h= avctx->priv_data;
3149     int i;
3150     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3151         if(h->delayed_pic[i])
3152             h->delayed_pic[i]->reference= 0;
3153         h->delayed_pic[i]= NULL;
3154     }
3155     h->outputed_poc= INT_MIN;
3156     idr(h);
3157     if(h->s.current_picture_ptr)
3158         h->s.current_picture_ptr->reference= 0;
3159     h->s.first_field= 0;
3160     reset_sei(h);
3161     ff_mpeg_flush(avctx);
3162 }
3163
3164 /**
3165  * Find a Picture in the short term reference list by frame number.
3166  * @param frame_num frame number to search for
3167  * @param idx the index into h->short_ref where returned picture is found
3168  *            undefined if no picture found.
3169  * @return pointer to the found picture, or NULL if no pic with the provided
3170  *                 frame number is found
3171  */
3172 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3173     MpegEncContext * const s = &h->s;
3174     int i;
3175
3176     for(i=0; i<h->short_ref_count; i++){
3177         Picture *pic= h->short_ref[i];
3178         if(s->avctx->debug&FF_DEBUG_MMCO)
3179             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3180         if(pic->frame_num == frame_num) {
3181             *idx = i;
3182             return pic;
3183         }
3184     }
3185     return NULL;
3186 }
3187
3188 /**
3189  * Remove a picture from the short term reference list by its index in
3190  * that list.  This does no checking on the provided index; it is assumed
3191  * to be valid. Other list entries are shifted down.
3192  * @param i index into h->short_ref of picture to remove.
3193  */
3194 static void remove_short_at_index(H264Context *h, int i){
3195     assert(i >= 0 && i < h->short_ref_count);
3196     h->short_ref[i]= NULL;
3197     if (--h->short_ref_count)
3198         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3199 }
3200
3201 /**
3202  *
3203  * @return the removed picture or NULL if an error occurs
3204  */
3205 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3206     MpegEncContext * const s = &h->s;
3207     Picture *pic;
3208     int i;
3209
3210     if(s->avctx->debug&FF_DEBUG_MMCO)
3211         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3212
3213     pic = find_short(h, frame_num, &i);
3214     if (pic){
3215         if(unreference_pic(h, pic, ref_mask))
3216         remove_short_at_index(h, i);
3217     }
3218
3219     return pic;
3220 }
3221
3222 /**
3223  * Remove a picture from the long term reference list by its index in
3224  * that list.
3225  * @return the removed picture or NULL if an error occurs
3226  */
3227 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3228     Picture *pic;
3229
3230     pic= h->long_ref[i];
3231     if (pic){
3232         if(unreference_pic(h, pic, ref_mask)){
3233             assert(h->long_ref[i]->long_ref == 1);
3234             h->long_ref[i]->long_ref= 0;
3235             h->long_ref[i]= NULL;
3236             h->long_ref_count--;
3237         }
3238     }
3239
3240     return pic;
3241 }
3242
3243 /**
3244  * print short term list
3245  */
3246 static void print_short_term(H264Context *h) {
3247     uint32_t i;
3248     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3249         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3250         for(i=0; i<h->short_ref_count; i++){
3251             Picture *pic= h->short_ref[i];
3252             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3253         }
3254     }
3255 }
3256
3257 /**
3258  * print long term list
3259  */
3260 static void print_long_term(H264Context *h) {
3261     uint32_t i;
3262     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3263         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3264         for(i = 0; i < 16; i++){
3265             Picture *pic= h->long_ref[i];
3266             if (pic) {
3267                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3268             }
3269         }
3270     }
3271 }
3272
3273 /**
3274  * Executes the reference picture marking (memory management control operations).
3275  */
3276 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3277     MpegEncContext * const s = &h->s;
3278     int i, av_uninit(j);
3279     int current_ref_assigned=0;
3280     Picture *av_uninit(pic);
3281
3282     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3283         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3284
3285     for(i=0; i<mmco_count; i++){
3286         int av_uninit(structure), av_uninit(frame_num);
3287         if(s->avctx->debug&FF_DEBUG_MMCO)
3288             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3289
3290         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3291            || mmco[i].opcode == MMCO_SHORT2LONG){
3292             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3293             pic = find_short(h, frame_num, &j);
3294             if(!pic){
3295                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3296                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3297                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3298                 continue;
3299             }
3300         }
3301
3302         switch(mmco[i].opcode){
3303         case MMCO_SHORT2UNUSED:
3304             if(s->avctx->debug&FF_DEBUG_MMCO)
3305                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3306             remove_short(h, frame_num, structure ^ PICT_FRAME);
3307             break;
3308         case MMCO_SHORT2LONG:
3309                 if (h->long_ref[mmco[i].long_arg] != pic)
3310                     remove_long(h, mmco[i].long_arg, 0);
3311
3312                 remove_short_at_index(h, j);
3313                 h->long_ref[ mmco[i].long_arg ]= pic;
3314                 if (h->long_ref[ mmco[i].long_arg ]){
3315                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3316                     h->long_ref_count++;
3317                 }
3318             break;
3319         case MMCO_LONG2UNUSED:
3320             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3321             pic = h->long_ref[j];
3322             if (pic) {
3323                 remove_long(h, j, structure ^ PICT_FRAME);
3324             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3325                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3326             break;
3327         case MMCO_LONG:
3328                     // Comment below left from previous code as it is an interresting note.
3329                     /* First field in pair is in short term list or
3330                      * at a different long term index.
3331                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3332                      * Report the problem and keep the pair where it is,
3333                      * and mark this field valid.
3334                      */
3335
3336             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3337                 remove_long(h, mmco[i].long_arg, 0);
3338
3339                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3340                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3341                 h->long_ref_count++;
3342             }
3343
3344             s->current_picture_ptr->reference |= s->picture_structure;
3345             current_ref_assigned=1;
3346             break;
3347         case MMCO_SET_MAX_LONG:
3348             assert(mmco[i].long_arg <= 16);
3349             // just remove the long term which index is greater than new max
3350             for(j = mmco[i].long_arg; j<16; j++){
3351                 remove_long(h, j, 0);
3352             }
3353             break;
3354         case MMCO_RESET:
3355             while(h->short_ref_count){
3356                 remove_short(h, h->short_ref[0]->frame_num, 0);
3357             }
3358             for(j = 0; j < 16; j++) {
3359                 remove_long(h, j, 0);
3360             }
3361             s->current_picture_ptr->poc=
3362             s->current_picture_ptr->field_poc[0]=
3363             s->current_picture_ptr->field_poc[1]=
3364             h->poc_lsb=
3365             h->poc_msb=
3366             h->frame_num=
3367             s->current_picture_ptr->frame_num= 0;
3368             break;
3369         default: assert(0);
3370         }
3371     }
3372
3373     if (!current_ref_assigned) {
3374         /* Second field of complementary field pair; the first field of
3375          * which is already referenced. If short referenced, it
3376          * should be first entry in short_ref. If not, it must exist
3377          * in long_ref; trying to put it on the short list here is an
3378          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3379          */
3380         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3381             /* Just mark the second field valid */
3382             s->current_picture_ptr->reference = PICT_FRAME;
3383         } else if (s->current_picture_ptr->long_ref) {
3384             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3385                                              "assignment for second field "
3386                                              "in complementary field pair "
3387                                              "(first field is long term)\n");
3388         } else {
3389             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3390             if(pic){
3391                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3392             }
3393
3394             if(h->short_ref_count)
3395                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3396
3397             h->short_ref[0]= s->current_picture_ptr;
3398             h->short_ref_count++;
3399             s->current_picture_ptr->reference |= s->picture_structure;
3400         }
3401     }
3402
3403     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3404
3405         /* We have too many reference frames, probably due to corrupted
3406          * stream. Need to discard one frame. Prevents overrun of the
3407          * short_ref and long_ref buffers.
3408          */
3409         av_log(h->s.avctx, AV_LOG_ERROR,
3410                "number of reference frames exceeds max (probably "
3411                "corrupt input), discarding one\n");
3412
3413         if (h->long_ref_count && !h->short_ref_count) {
3414             for (i = 0; i < 16; ++i)
3415                 if (h->long_ref[i])
3416                     break;
3417
3418             assert(i < 16);
3419             remove_long(h, i, 0);
3420         } else {
3421             pic = h->short_ref[h->short_ref_count - 1];
3422             remove_short(h, pic->frame_num, 0);
3423         }
3424     }
3425
3426     print_short_term(h);
3427     print_long_term(h);
3428     return 0;
3429 }
3430
3431 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3432     MpegEncContext * const s = &h->s;
3433     int i;
3434
3435     h->mmco_index= 0;
3436     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3437         s->broken_link= get_bits1(gb) -1;
3438         if(get_bits1(gb)){
3439             h->mmco[0].opcode= MMCO_LONG;
3440             h->mmco[0].long_arg= 0;
3441             h->mmco_index= 1;
3442         }
3443     }else{
3444         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3445             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3446                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3447
3448                 h->mmco[i].opcode= opcode;
3449                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3450                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3451 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3452                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3453                         return -1;
3454                     }*/
3455                 }
3456                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3457                     unsigned int long_arg= get_ue_golomb_31(gb);
3458                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3459                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3460                         return -1;
3461                     }
3462                     h->mmco[i].long_arg= long_arg;
3463                 }
3464
3465                 if(opcode > (unsigned)MMCO_LONG){
3466                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3467                     return -1;
3468                 }
3469                 if(opcode == MMCO_END)
3470                     break;
3471             }
3472             h->mmco_index= i;
3473         }else{
3474             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3475
3476             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3477                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3478                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3479                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3480                 h->mmco_index= 1;
3481                 if (FIELD_PICTURE) {
3482                     h->mmco[0].short_pic_num *= 2;
3483                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3484                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3485                     h->mmco_index= 2;
3486                 }
3487             }
3488         }
3489     }
3490
3491     return 0;
3492 }
3493
3494 static int init_poc(H264Context *h){
3495     MpegEncContext * const s = &h->s;
3496     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3497     int field_poc[2];
3498     Picture *cur = s->current_picture_ptr;
3499
3500     h->frame_num_offset= h->prev_frame_num_offset;
3501     if(h->frame_num < h->prev_frame_num)
3502         h->frame_num_offset += max_frame_num;
3503
3504     if(h->sps.poc_type==0){
3505         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3506
3507         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3508             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3509         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3510             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3511         else
3512             h->poc_msb = h->prev_poc_msb;
3513 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3514         field_poc[0] =
3515         field_poc[1] = h->poc_msb + h->poc_lsb;
3516         if(s->picture_structure == PICT_FRAME)
3517             field_poc[1] += h->delta_poc_bottom;
3518     }else if(h->sps.poc_type==1){
3519         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3520         int i;
3521
3522         if(h->sps.poc_cycle_length != 0)
3523             abs_frame_num = h->frame_num_offset + h->frame_num;
3524         else
3525             abs_frame_num = 0;
3526
3527         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3528             abs_frame_num--;
3529
3530         expected_delta_per_poc_cycle = 0;
3531         for(i=0; i < h->sps.poc_cycle_length; i++)
3532             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3533
3534         if(abs_frame_num > 0){
3535             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3536             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3537
3538             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3539             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3540                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3541         } else
3542             expectedpoc = 0;
3543
3544         if(h->nal_ref_idc == 0)
3545             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3546
3547         field_poc[0] = expectedpoc + h->delta_poc[0];
3548         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3549
3550         if(s->picture_structure == PICT_FRAME)
3551             field_poc[1] += h->delta_poc[1];
3552     }else{
3553         int poc= 2*(h->frame_num_offset + h->frame_num);
3554
3555         if(!h->nal_ref_idc)
3556             poc--;
3557
3558         field_poc[0]= poc;
3559         field_poc[1]= poc;
3560     }
3561
3562     if(s->picture_structure != PICT_BOTTOM_FIELD)
3563         s->current_picture_ptr->field_poc[0]= field_poc[0];
3564     if(s->picture_structure != PICT_TOP_FIELD)
3565         s->current_picture_ptr->field_poc[1]= field_poc[1];
3566     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3567
3568     return 0;
3569 }
3570
3571
3572 /**
3573  * initialize scan tables
3574  */
3575 static void init_scan_tables(H264Context *h){
3576     MpegEncContext * const s = &h->s;
3577     int i;
3578     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3579         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3580         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3581     }else{
3582         for(i=0; i<16; i++){
3583 #define T(x) (x>>2) | ((x<<2) & 0xF)
3584             h->zigzag_scan[i] = T(zigzag_scan[i]);
3585             h-> field_scan[i] = T( field_scan[i]);
3586 #undef T
3587         }
3588     }
3589     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3590         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3591         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3592         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3593         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3594     }else{
3595         for(i=0; i<64; i++){
3596 #define T(x) (x>>3) | ((x&7)<<3)
3597             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3598             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3599             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3600             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3601 #undef T
3602         }
3603     }
3604     if(h->sps.transform_bypass){ //FIXME same ugly
3605         h->zigzag_scan_q0          = zigzag_scan;
3606         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3607         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3608         h->field_scan_q0           = field_scan;
3609         h->field_scan8x8_q0        = field_scan8x8;
3610         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3611     }else{
3612         h->zigzag_scan_q0          = h->zigzag_scan;
3613         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3614         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3615         h->field_scan_q0           = h->field_scan;
3616         h->field_scan8x8_q0        = h->field_scan8x8;
3617         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3618     }
3619 }
3620
3621 static void field_end(H264Context *h){
3622     MpegEncContext * const s = &h->s;
3623     AVCodecContext * const avctx= s->avctx;
3624     s->mb_y= 0;
3625
3626     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3627     s->current_picture_ptr->pict_type= s->pict_type;
3628
3629     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3630         ff_vdpau_h264_set_reference_frames(s);
3631
3632     if(!s->dropable) {
3633         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3634         h->prev_poc_msb= h->poc_msb;
3635         h->prev_poc_lsb= h->poc_lsb;
3636     }
3637     h->prev_frame_num_offset= h->frame_num_offset;
3638     h->prev_frame_num= h->frame_num;
3639
3640     if (avctx->hwaccel) {
3641         if (avctx->hwaccel->end_frame(avctx) < 0)
3642             av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3643     }
3644
3645     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3646         ff_vdpau_h264_picture_complete(s);
3647
3648     /*
3649      * FIXME: Error handling code does not seem to support interlaced
3650      * when slices span multiple rows
3651      * The ff_er_add_slice calls don't work right for bottom
3652      * fields; they cause massive erroneous error concealing
3653      * Error marking covers both fields (top and bottom).
3654      * This causes a mismatched s->error_count
3655      * and a bad error table. Further, the error count goes to
3656      * INT_MAX when called for bottom field, because mb_y is
3657      * past end by one (callers fault) and resync_mb_y != 0
3658      * causes problems for the first MB line, too.
3659      */
3660     if (!FIELD_PICTURE)
3661         ff_er_frame_end(s);
3662
3663     MPV_frame_end(s);
3664
3665     h->current_slice=0;
3666 }
3667
3668 /**
3669  * Replicates H264 "master" context to thread contexts.
3670  */
3671 static void clone_slice(H264Context *dst, H264Context *src)
3672 {
3673     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3674     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3675     dst->s.current_picture      = src->s.current_picture;
3676     dst->s.linesize             = src->s.linesize;
3677     dst->s.uvlinesize           = src->s.uvlinesize;
3678     dst->s.first_field          = src->s.first_field;
3679
3680     dst->prev_poc_msb           = src->prev_poc_msb;
3681     dst->prev_poc_lsb           = src->prev_poc_lsb;
3682     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3683     dst->prev_frame_num         = src->prev_frame_num;
3684     dst->short_ref_count        = src->short_ref_count;
3685
3686     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3687     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3688     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3689     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3690
3691     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3692     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3693 }
3694
3695 /**
3696  * decodes a slice header.
3697  * This will also call MPV_common_init() and frame_start() as needed.
3698  *
3699  * @param h h264context
3700  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3701  *
3702  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3703  */
3704 static int decode_slice_header(H264Context *h, H264Context *h0){
3705     MpegEncContext * const s = &h->s;
3706     MpegEncContext * const s0 = &h0->s;
3707     unsigned int first_mb_in_slice;
3708     unsigned int pps_id;
3709     int num_ref_idx_active_override_flag;
3710     unsigned int slice_type, tmp, i, j;
3711     int default_ref_list_done = 0;
3712     int last_pic_structure;
3713
3714     s->dropable= h->nal_ref_idc == 0;
3715
3716     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3717         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3718         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3719     }else{
3720         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3721         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3722     }
3723
3724     first_mb_in_slice= get_ue_golomb(&s->gb);
3725
3726     if(first_mb_in_slice == 0){ //FIXME better field boundary detection
3727         if(h0->current_slice && FIELD_PICTURE){
3728             field_end(h);
3729         }
3730
3731         h0->current_slice = 0;
3732         if (!s0->first_field)
3733             s->current_picture_ptr= NULL;
3734     }
3735
3736     slice_type= get_ue_golomb_31(&s->gb);
3737     if(slice_type > 9){
3738         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3739         return -1;
3740     }
3741     if(slice_type > 4){
3742         slice_type -= 5;
3743         h->slice_type_fixed=1;
3744     }else
3745         h->slice_type_fixed=0;
3746
3747     slice_type= golomb_to_pict_type[ slice_type ];
3748     if (slice_type == FF_I_TYPE
3749         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3750         default_ref_list_done = 1;
3751     }
3752     h->slice_type= slice_type;
3753     h->slice_type_nos= slice_type & 3;
3754
3755     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3756     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3757         av_log(h->s.avctx, AV_LOG_ERROR,
3758                "B picture before any references, skipping\n");
3759         return -1;
3760     }
3761
3762     pps_id= get_ue_golomb(&s->gb);
3763     if(pps_id>=MAX_PPS_COUNT){
3764         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3765         return -1;
3766     }
3767     if(!h0->pps_buffers[pps_id]) {
3768         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3769         return -1;
3770     }
3771     h->pps= *h0->pps_buffers[pps_id];
3772
3773     if(!h0->sps_buffers[h->pps.sps_id]) {
3774         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3775         return -1;
3776     }
3777     h->sps = *h0->sps_buffers[h->pps.sps_id];
3778
3779     if(h == h0 && h->dequant_coeff_pps != pps_id){
3780         h->dequant_coeff_pps = pps_id;
3781         init_dequant_tables(h);
3782     }
3783
3784     s->mb_width= h->sps.mb_width;
3785     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3786
3787     h->b_stride=  s->mb_width*4;
3788     h->b8_stride= s->mb_width*2;
3789
3790     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3791     if(h->sps.frame_mbs_only_flag)
3792         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3793     else
3794         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3795
3796     if (s->context_initialized
3797         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3798         if(h != h0)
3799             return -1;   // width / height changed during parallelized decoding
3800         free_tables(h);
3801         flush_dpb(s->avctx);
3802         MPV_common_end(s);
3803     }
3804     if (!s->context_initialized) {
3805         if(h != h0)
3806             return -1;  // we cant (re-)initialize context during parallel decoding
3807         if (MPV_common_init(s) < 0)
3808             return -1;
3809         s->first_field = 0;
3810
3811         init_scan_tables(h);
3812         alloc_tables(h);
3813
3814         for(i = 1; i < s->avctx->thread_count; i++) {
3815             H264Context *c;
3816             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3817             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3818             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3819             c->sps = h->sps;
3820             c->pps = h->pps;
3821             init_scan_tables(c);
3822             clone_tables(c, h);
3823         }
3824
3825         for(i = 0; i < s->avctx->thread_count; i++)
3826             if(context_init(h->thread_context[i]) < 0)
3827                 return -1;
3828
3829         s->avctx->width = s->width;
3830         s->avctx->height = s->height;
3831         s->avctx->sample_aspect_ratio= h->sps.sar;
3832         if(!s->avctx->sample_aspect_ratio.den)
3833             s->avctx->sample_aspect_ratio.den = 1;
3834
3835         if(h->sps.timing_info_present_flag){
3836             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3837             if(h->x264_build > 0 && h->x264_build < 44)
3838                 s->avctx->time_base.den *= 2;
3839             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3840                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3841         }
3842     }
3843
3844     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3845
3846     h->mb_mbaff = 0;
3847     h->mb_aff_frame = 0;
3848     last_pic_structure = s0->picture_structure;
3849     if(h->sps.frame_mbs_only_flag){
3850         s->picture_structure= PICT_FRAME;
3851     }else{
3852         if(get_bits1(&s->gb)) { //field_pic_flag
3853             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3854         } else {
3855             s->picture_structure= PICT_FRAME;
3856             h->mb_aff_frame = h->sps.mb_aff;
3857         }
3858     }
3859     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3860
3861     if(h0->current_slice == 0){
3862         while(h->frame_num !=  h->prev_frame_num &&
3863               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3864             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3865             if (frame_start(h) < 0)
3866                 return -1;
3867             h->prev_frame_num++;
3868             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3869             s->current_picture_ptr->frame_num= h->prev_frame_num;
3870             execute_ref_pic_marking(h, NULL, 0);
3871         }
3872
3873         /* See if we have a decoded first field looking for a pair... */
3874         if (s0->first_field) {
3875             assert(s0->current_picture_ptr);
3876             assert(s0->current_picture_ptr->data[0]);
3877             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3878
3879             /* figure out if we have a complementary field pair */
3880             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3881                 /*
3882                  * Previous field is unmatched. Don't display it, but let it
3883                  * remain for reference if marked as such.
3884                  */
3885                 s0->current_picture_ptr = NULL;
3886                 s0->first_field = FIELD_PICTURE;
3887
3888             } else {
3889                 if (h->nal_ref_idc &&
3890                         s0->current_picture_ptr->reference &&
3891                         s0->current_picture_ptr->frame_num != h->frame_num) {
3892                     /*
3893                      * This and previous field were reference, but had
3894                      * different frame_nums. Consider this field first in
3895                      * pair. Throw away previous field except for reference
3896                      * purposes.
3897                      */
3898                     s0->first_field = 1;
3899                     s0->current_picture_ptr = NULL;
3900
3901                 } else {
3902                     /* Second field in complementary pair */
3903                     s0->first_field = 0;
3904                 }
3905             }
3906
3907         } else {
3908             /* Frame or first field in a potentially complementary pair */
3909             assert(!s0->current_picture_ptr);
3910             s0->first_field = FIELD_PICTURE;
3911         }
3912
3913         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3914             s0->first_field = 0;
3915             return -1;
3916         }
3917     }
3918     if(h != h0)
3919         clone_slice(h, h0);
3920
3921     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3922
3923     assert(s->mb_num == s->mb_width * s->mb_height);
3924     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3925        first_mb_in_slice                    >= s->mb_num){
3926         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3927         return -1;
3928     }
3929     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3930     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3931     if (s->picture_structure == PICT_BOTTOM_FIELD)
3932         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3933     assert(s->mb_y < s->mb_height);
3934
3935     if(s->picture_structure==PICT_FRAME){
3936         h->curr_pic_num=   h->frame_num;
3937         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3938     }else{
3939         h->curr_pic_num= 2*h->frame_num + 1;
3940         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3941     }
3942
3943     if(h->nal_unit_type == NAL_IDR_SLICE){
3944         get_ue_golomb(&s->gb); /* idr_pic_id */
3945     }
3946
3947     if(h->sps.poc_type==0){
3948         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3949
3950         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3951             h->delta_poc_bottom= get_se_golomb(&s->gb);
3952         }
3953     }
3954
3955     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3956         h->delta_poc[0]= get_se_golomb(&s->gb);
3957
3958         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3959             h->delta_poc[1]= get_se_golomb(&s->gb);
3960     }
3961
3962     init_poc(h);
3963
3964     if(h->pps.redundant_pic_cnt_present){
3965         h->redundant_pic_count= get_ue_golomb(&s->gb);
3966     }
3967
3968     //set defaults, might be overridden a few lines later
3969     h->ref_count[0]= h->pps.ref_count[0];
3970     h->ref_count[1]= h->pps.ref_count[1];
3971
3972     if(h->slice_type_nos != FF_I_TYPE){
3973         if(h->slice_type_nos == FF_B_TYPE){
3974             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3975         }
3976         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3977
3978         if(num_ref_idx_active_override_flag){
3979             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3980             if(h->slice_type_nos==FF_B_TYPE)
3981                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3982
3983             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3984                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3985                 h->ref_count[0]= h->ref_count[1]= 1;
3986                 return -1;
3987             }
3988         }
3989         if(h->slice_type_nos == FF_B_TYPE)
3990             h->list_count= 2;
3991         else
3992             h->list_count= 1;
3993     }else
3994         h->list_count= 0;
3995
3996     if(!default_ref_list_done){
3997         fill_default_ref_list(h);
3998     }
3999
4000     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
4001         return -1;
4002
4003     if(h->slice_type_nos!=FF_I_TYPE){
4004         s->last_picture_ptr= &h->ref_list[0][0];
4005         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
4006     }
4007     if(h->slice_type_nos==FF_B_TYPE){
4008         s->next_picture_ptr= &h->ref_list[1][0];
4009         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4010     }
4011
4012     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4013        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4014         pred_weight_table(h);
4015     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4016         implicit_weight_table(h);
4017     else {
4018         h->use_weight = 0;
4019         for (i = 0; i < 2; i++) {
4020             h->luma_weight_flag[i]   = 0;
4021             h->chroma_weight_flag[i] = 0;
4022         }
4023     }
4024
4025     if(h->nal_ref_idc)
4026         decode_ref_pic_marking(h0, &s->gb);
4027
4028     if(FRAME_MBAFF)
4029         fill_mbaff_ref_list(h);
4030
4031     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4032         direct_dist_scale_factor(h);
4033     direct_ref_list_init(h);
4034
4035     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4036         tmp = get_ue_golomb_31(&s->gb);
4037         if(tmp > 2){
4038             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4039             return -1;
4040         }
4041         h->cabac_init_idc= tmp;
4042     }
4043
4044     h->last_qscale_diff = 0;
4045     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4046     if(tmp>51){
4047         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4048         return -1;
4049     }
4050     s->qscale= tmp;
4051     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4052     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4053     //FIXME qscale / qp ... stuff
4054     if(h->slice_type == FF_SP_TYPE){
4055         get_bits1(&s->gb); /* sp_for_switch_flag */
4056     }
4057     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4058         get_se_golomb(&s->gb); /* slice_qs_delta */
4059     }
4060
4061     h->deblocking_filter = 1;
4062     h->slice_alpha_c0_offset = 0;
4063     h->slice_beta_offset = 0;
4064     if( h->pps.deblocking_filter_parameters_present ) {
4065         tmp= get_ue_golomb_31(&s->gb);
4066         if(tmp > 2){
4067             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4068             return -1;
4069         }
4070         h->deblocking_filter= tmp;
4071         if(h->deblocking_filter < 2)
4072             h->deblocking_filter^= 1; // 1<->0
4073
4074         if( h->deblocking_filter ) {
4075             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4076             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4077         }
4078     }
4079
4080     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4081        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4082        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4083        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4084         h->deblocking_filter= 0;
4085
4086     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4087         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4088             /* Cheat slightly for speed:
4089                Do not bother to deblock across slices. */
4090             h->deblocking_filter = 2;
4091         } else {
4092             h0->max_contexts = 1;
4093             if(!h0->single_decode_warning) {
4094                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4095                 h0->single_decode_warning = 1;
4096             }
4097             if(h != h0)
4098                 return 1; // deblocking switched inside frame
4099         }
4100     }
4101
4102 #if 0 //FMO
4103     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4104         slice_group_change_cycle= get_bits(&s->gb, ?);
4105 #endif
4106
4107     h0->last_slice_type = slice_type;
4108     h->slice_num = ++h0->current_slice;
4109     if(h->slice_num >= MAX_SLICES){
4110         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4111     }
4112
4113     for(j=0; j<2; j++){
4114         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4115         ref2frm[0]=
4116         ref2frm[1]= -1;
4117         for(i=0; i<16; i++)
4118             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4119                           +(h->ref_list[j][i].reference&3);
4120         ref2frm[18+0]=
4121         ref2frm[18+1]= -1;
4122         for(i=16; i<48; i++)
4123             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4124                           +(h->ref_list[j][i].reference&3);
4125     }
4126
4127     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4128     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4129
4130     s->avctx->refs= h->sps.ref_frame_count;
4131
4132     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4133         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4134                h->slice_num,
4135                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4136                first_mb_in_slice,
4137                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4138                pps_id, h->frame_num,
4139                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4140                h->ref_count[0], h->ref_count[1],
4141                s->qscale,
4142                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4143                h->use_weight,
4144                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4145                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4146                );
4147     }
4148
4149     return 0;
4150 }
4151
4152 /**
4153  *
4154  */
4155 static inline int get_level_prefix(GetBitContext *gb){
4156     unsigned int buf;
4157     int log;
4158
4159     OPEN_READER(re, gb);
4160     UPDATE_CACHE(re, gb);
4161     buf=GET_CACHE(re, gb);
4162
4163     log= 32 - av_log2(buf);
4164 #ifdef TRACE
4165     print_bin(buf>>(32-log), log);
4166     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4167 #endif
4168
4169     LAST_SKIP_BITS(re, gb, log);
4170     CLOSE_READER(re, gb);
4171
4172     return log-1;
4173 }
4174
4175 static inline int get_dct8x8_allowed(H264Context *h){
4176     if(h->sps.direct_8x8_inference_flag)
4177         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4178     else
4179         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4180 }
4181
4182 /**
4183  * decodes a residual block.
4184  * @param n block index
4185  * @param scantable scantable
4186  * @param max_coeff number of coefficients in the block
4187  * @return <0 if an error occurred
4188  */
4189 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4190     MpegEncContext * const s = &h->s;
4191     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4192     int level[16];
4193     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4194
4195     //FIXME put trailing_onex into the context
4196
4197     if(n == CHROMA_DC_BLOCK_INDEX){
4198         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4199         total_coeff= coeff_token>>2;
4200     }else{
4201         if(n == LUMA_DC_BLOCK_INDEX){
4202             total_coeff= pred_non_zero_count(h, 0);
4203             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4204             total_coeff= coeff_token>>2;
4205         }else{
4206             total_coeff= pred_non_zero_count(h, n);
4207             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4208             total_coeff= coeff_token>>2;
4209             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4210         }
4211     }
4212
4213     //FIXME set last_non_zero?
4214
4215     if(total_coeff==0)
4216         return 0;
4217     if(total_coeff > (unsigned)max_coeff) {
4218         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4219         return -1;
4220     }
4221
4222     trailing_ones= coeff_token&3;
4223     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4224     assert(total_coeff<=16);
4225
4226     i = show_bits(gb, 3);
4227     skip_bits(gb, trailing_ones);
4228     level[0] = 1-((i&4)>>1);
4229     level[1] = 1-((i&2)   );
4230     level[2] = 1-((i&1)<<1);
4231
4232     if(trailing_ones<total_coeff) {
4233         int mask, prefix;
4234         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4235         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4236         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4237
4238         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4239         if(level_code >= 100){
4240             prefix= level_code - 100;
4241             if(prefix == LEVEL_TAB_BITS)
4242                 prefix += get_level_prefix(gb);
4243
4244             //first coefficient has suffix_length equal to 0 or 1
4245             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4246                 if(suffix_length)
4247                     level_code= (prefix<<1) + get_bits1(gb); //part
4248                 else
4249                     level_code= prefix; //part
4250             }else if(prefix==14){
4251                 if(suffix_length)
4252                     level_code= (prefix<<1) + get_bits1(gb); //part
4253                 else
4254                     level_code= prefix + get_bits(gb, 4); //part
4255             }else{
4256                 level_code= 30 + get_bits(gb, prefix-3); //part
4257                 if(prefix>=16)
4258                     level_code += (1<<(prefix-3))-4096;
4259             }
4260
4261             if(trailing_ones < 3) level_code += 2;
4262
4263             suffix_length = 2;
4264             mask= -(level_code&1);
4265             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4266         }else{
4267             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4268
4269             suffix_length = 1;
4270             if(level_code + 3U > 6U)
4271                 suffix_length++;
4272             level[trailing_ones]= level_code;
4273         }
4274
4275         //remaining coefficients have suffix_length > 0
4276         for(i=trailing_ones+1;i<total_coeff;i++) {
4277             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4278             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4279             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4280
4281             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4282             if(level_code >= 100){
4283                 prefix= level_code - 100;
4284                 if(prefix == LEVEL_TAB_BITS){
4285                     prefix += get_level_prefix(gb);
4286                 }
4287                 if(prefix<15){
4288                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4289                 }else{
4290                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4291                     if(prefix>=16)
4292                         level_code += (1<<(prefix-3))-4096;
4293                 }
4294                 mask= -(level_code&1);
4295                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4296             }
4297             level[i]= level_code;
4298
4299             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4300                 suffix_length++;
4301         }
4302     }
4303
4304     if(total_coeff == max_coeff)
4305         zeros_left=0;
4306     else{
4307         if(n == CHROMA_DC_BLOCK_INDEX)
4308             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4309         else
4310             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4311     }
4312
4313     coeff_num = zeros_left + total_coeff - 1;
4314     j = scantable[coeff_num];
4315     if(n > 24){
4316         block[j] = level[0];
4317         for(i=1;i<total_coeff;i++) {
4318             if(zeros_left <= 0)
4319                 run_before = 0;
4320             else if(zeros_left < 7){
4321                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4322             }else{
4323                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4324             }
4325             zeros_left -= run_before;
4326             coeff_num -= 1 + run_before;
4327             j= scantable[ coeff_num ];
4328
4329             block[j]= level[i];
4330         }
4331     }else{
4332         block[j] = (level[0] * qmul[j] + 32)>>6;
4333         for(i=1;i<total_coeff;i++) {
4334             if(zeros_left <= 0)
4335                 run_before = 0;
4336             else if(zeros_left < 7){
4337                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4338             }else{
4339                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4340             }
4341             zeros_left -= run_before;
4342             coeff_num -= 1 + run_before;
4343             j= scantable[ coeff_num ];
4344
4345             block[j]= (level[i] * qmul[j] + 32)>>6;
4346         }
4347     }
4348
4349     if(zeros_left<0){
4350         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4351         return -1;
4352     }
4353
4354     return 0;
4355 }
4356
4357 static void predict_field_decoding_flag(H264Context *h){
4358     MpegEncContext * const s = &h->s;
4359     const int mb_xy= h->mb_xy;
4360     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4361                 ? s->current_picture.mb_type[mb_xy-1]
4362                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4363                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4364                 : 0;
4365     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4366 }
4367
4368 /**
4369  * decodes a P_SKIP or B_SKIP macroblock
4370  */
4371 static void decode_mb_skip(H264Context *h){
4372     MpegEncContext * const s = &h->s;
4373     const int mb_xy= h->mb_xy;
4374     int mb_type=0;
4375
4376     memset(h->non_zero_count[mb_xy], 0, 16);
4377     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4378
4379     if(MB_FIELD)
4380         mb_type|= MB_TYPE_INTERLACED;
4381
4382     if( h->slice_type_nos == FF_B_TYPE )
4383     {
4384         // just for fill_caches. pred_direct_motion will set the real mb_type
4385         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4386
4387         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4388         pred_direct_motion(h, &mb_type);
4389         mb_type|= MB_TYPE_SKIP;
4390     }
4391     else
4392     {
4393         int mx, my;
4394         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4395
4396         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4397         pred_pskip_motion(h, &mx, &my);
4398         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4399         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4400     }
4401
4402     write_back_motion(h, mb_type);
4403     s->current_picture.mb_type[mb_xy]= mb_type;
4404     s->current_picture.qscale_table[mb_xy]= s->qscale;
4405     h->slice_table[ mb_xy ]= h->slice_num;
4406     h->prev_mb_skipped= 1;
4407 }
4408
4409 /**
4410  * Decodes one macroblock of a CAVLC coded slice.
      *
      * In order, this handles: the skip run (non I slices), the MBAFF field
      * decoding flag, the macroblock type, either the intra prediction modes or
      * the reference indices and motion vectors, the coded block pattern, the
      * 8x8 transform flag, the quantizer delta and finally the residual blocks
      * through decode_residual().
      * Syntax elements are read from s->gb; residual data is read through
      * h->intra_gb_ptr or h->inter_gb_ptr depending on the macroblock type.
      *
4411  * @returns 0 if OK, -1 if an error is noticed
4412  */
4413 static int decode_mb_cavlc(H264Context *h){
4414     MpegEncContext * const s = &h->s;
4415     int mb_xy;
4416     int partition_count;
4417     unsigned int mb_type, cbp;
4418     int dct8x8_allowed= h->pps.transform_8x8_mode;
4419
4420     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4421
4422     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4423     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4424                 down the code */
4425     if(h->slice_type_nos != FF_I_TYPE){
4426         if(s->mb_skip_run==-1) // -1: a new skip run has to be read
4427             s->mb_skip_run= get_ue_golomb(&s->gb);
4428
4429         if (s->mb_skip_run--) { // run was non zero: this macroblock is skipped
4430             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4431                 if(s->mb_skip_run==0)
4432                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4433                 else
4434                     predict_field_decoding_flag(h);
4435             }
4436             decode_mb_skip(h);
4437             return 0;
4438         }
4439     }
4440     if(FRAME_MBAFF){
4441         if( (s->mb_y&1) == 0 )
4442             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4443     }
4444
4445     h->prev_mb_skipped= 0;
4446
          // B and P slices map small mb_type values through their own tables;
          // larger values are rebased (minus 23 resp. 5) and handled as intra types
4447     mb_type= get_ue_golomb(&s->gb);
4448     if(h->slice_type_nos == FF_B_TYPE){
4449         if(mb_type < 23){
4450             partition_count= b_mb_type_info[mb_type].partition_count;
4451             mb_type=         b_mb_type_info[mb_type].type;
4452         }else{
4453             mb_type -= 23;
4454             goto decode_intra_mb;
4455         }
4456     }else if(h->slice_type_nos == FF_P_TYPE){
4457         if(mb_type < 5){
4458             partition_count= p_mb_type_info[mb_type].partition_count;
4459             mb_type=         p_mb_type_info[mb_type].type;
4460         }else{
4461             mb_type -= 5;
4462             goto decode_intra_mb;
4463         }
4464     }else{
4465        assert(h->slice_type_nos == FF_I_TYPE);
4466         if(h->slice_type == FF_SI_TYPE && mb_type)
4467             mb_type--;
4468 decode_intra_mb:
4469         if(mb_type > 25){
4470             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4471             return -1;
4472         }
4473         partition_count=0;
4474         cbp= i_mb_type_info[mb_type].cbp;
4475         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4476         mb_type= i_mb_type_info[mb_type].type;
4477     }
4478
4479     if(MB_FIELD)
4480         mb_type |= MB_TYPE_INTERLACED;
4481
4482     h->slice_table[ mb_xy ]= h->slice_num;
4483
4484     if(IS_INTRA_PCM(mb_type)){
4485         unsigned int x;
4486
4487         // We assume these blocks are very rare so we do not optimize it.
4488         align_get_bits(&s->gb);
4489
4490         // The pixels are stored in the same order as levels in h->mb array.
4491         for(x=0; x < (CHROMA ? 384 : 256); x++){
4492             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4493         }
4494
4495         // In deblocking, the quantizer is 0
4496         s->current_picture.qscale_table[mb_xy]= 0;
4497         // All coeffs are present
4498         memset(h->non_zero_count[mb_xy], 16, 16);
4499
4500         s->current_picture.mb_type[mb_xy]= mb_type;
4501         return 0;
4502     }
4503
          // The reference counts are doubled for the rest of this function and
          // halved again right before the final return 0.
          // NOTE(review): the "return -1" paths below do not undo the doubling.
4504     if(MB_MBAFF){
4505         h->ref_count[0] <<= 1;
4506         h->ref_count[1] <<= 1;
4507     }
4508
4509     fill_caches(h, mb_type, 0);
4510
4511     //mb_pred
4512     if(IS_INTRA(mb_type)){
4513         int pred_mode;
4514 //            init_top_left_availability(h);
4515         if(IS_INTRA4x4(mb_type)){
4516             int i;
4517             int di = 1;
                  // with the 8x8 transform one mode is read per group of four 4x4 blocks
4518             if(dct8x8_allowed && get_bits1(&s->gb)){
4519                 mb_type |= MB_TYPE_8x8DCT;
4520                 di = 4;
4521             }
4522
4523 //                fill_intra4x4_pred_table(h);
4524             for(i=0; i<16; i+=di){
4525                 int mode= pred_intra_mode(h, i);
4526
                      // a set bit keeps the predicted mode, otherwise a 3 bit
                      // value selects one of the remaining modes
4527                 if(!get_bits1(&s->gb)){
4528                     const int rem_mode= get_bits(&s->gb, 3);
4529                     mode = rem_mode + (rem_mode >= mode);
4530                 }
4531
4532                 if(di==4)
4533                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4534                 else
4535                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4536             }
4537             write_back_intra_pred_mode(h);
4538             if( check_intra4x4_pred_mode(h) < 0)
4539                 return -1;
4540         }else{
4541             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4542             if(h->intra16x16_pred_mode < 0)
4543                 return -1;
4544         }
4545         if(CHROMA){
4546             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4547             if(pred_mode < 0)
4548                 return -1;
4549             h->chroma_pred_mode= pred_mode;
4550         }
4551     }else if(partition_count==4){
4552         int i, j, sub_partition_count[4], list, ref[2][4];
4553
              // first all four sub macroblock types, then all reference
              // indices, then all motion vector differences
4554         if(h->slice_type_nos == FF_B_TYPE){
4555             for(i=0; i<4; i++){
4556                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4557                 if(h->sub_mb_type[i] >=13){
4558                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4559                     return -1;
4560                 }
4561                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4562                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4563             }
4564             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4565                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4566                 pred_direct_motion(h, &mb_type);
4567                 h->ref_cache[0][scan8[4]] =
4568                 h->ref_cache[1][scan8[4]] =
4569                 h->ref_cache[0][scan8[12]] =
4570                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4571             }
4572         }else{
4573             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4574             for(i=0; i<4; i++){
4575                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4576                 if(h->sub_mb_type[i] >=4){
4577                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4578                     return -1;
4579                 }
4580                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4581                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4582             }
4583         }
4584
4585         for(list=0; list<h->list_count; list++){
4586             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4587             for(i=0; i<4; i++){
4588                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4589                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4590                     unsigned int tmp;
                          // nothing is read for a single reference, one inverted
                          // bit for two references, an ue golomb code otherwise
4591                     if(ref_count == 1){
4592                         tmp= 0;
4593                     }else if(ref_count == 2){
4594                         tmp= get_bits1(&s->gb)^1;
4595                     }else{
4596                         tmp= get_ue_golomb_31(&s->gb);
4597                         if(tmp>=ref_count){
4598                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4599                             return -1;
4600                         }
4601                     }
4602                     ref[list][i]= tmp;
4603                 }else{
4604                  //FIXME
4605                     ref[list][i] = -1;
4606                 }
4607             }
4608         }
4609
4610         if(dct8x8_allowed)
4611             dct8x8_allowed = get_dct8x8_allowed(h);
4612
4613         for(list=0; list<h->list_count; list++){
4614             for(i=0; i<4; i++){
4615                 if(IS_DIRECT(h->sub_mb_type[i])) {
4616                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4617                     continue;
4618                 }
4619                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4620                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4621
4622                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4623                     const int sub_mb_type= h->sub_mb_type[i];
4624                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4625                     for(j=0; j<sub_partition_count[i]; j++){
4626                         int mx, my;
4627                         const int index= 4*i + block_width*j;
4628                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4629                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4630                         mx += get_se_golomb(&s->gb);
4631                         my += get_se_golomb(&s->gb);
4632                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4633
4634                         if(IS_SUB_8X8(sub_mb_type)){
4635                             mv_cache[ 1 ][0]=
4636                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4637                             mv_cache[ 1 ][1]=
4638                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4639                         }else if(IS_SUB_8X4(sub_mb_type)){
4640                             mv_cache[ 1 ][0]= mx;
4641                             mv_cache[ 1 ][1]= my;
4642                         }else if(IS_SUB_4X8(sub_mb_type)){
4643                             mv_cache[ 8 ][0]= mx;
4644                             mv_cache[ 8 ][1]= my;
4645                         }
4646                         mv_cache[ 0 ][0]= mx;
4647                         mv_cache[ 0 ][1]= my;
4648                     }
4649                 }else{
4650                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4651                     p[0] = p[1]=
4652                     p[8] = p[9]= 0;
4653                 }
4654             }
4655         }
4656     }else if(IS_DIRECT(mb_type)){
4657         pred_direct_motion(h, &mb_type);
4658         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4659     }else{
4660         int list, mx, my, i;
4661          //FIXME we should set ref_idx_l? to 0 if we use that later ...
              // for each partition shape: all reference indices are read
              // first, then all motion vector differences
4662         if(IS_16X16(mb_type)){
4663             for(list=0; list<h->list_count; list++){
4664                     unsigned int val;
4665                     if(IS_DIR(mb_type, 0, list)){
4666                         if(h->ref_count[list]==1){
4667                             val= 0;
4668                         }else if(h->ref_count[list]==2){
4669                             val= get_bits1(&s->gb)^1;
4670                         }else{
4671                             val= get_ue_golomb_31(&s->gb);
4672                             if(val >= h->ref_count[list]){
4673                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4674                                 return -1;
4675                             }
4676                         }
4677                     }else
4678                         val= LIST_NOT_USED&0xFF;
4679                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4680             }
4681             for(list=0; list<h->list_count; list++){
4682                 unsigned int val;
4683                 if(IS_DIR(mb_type, 0, list)){
4684                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4685                     mx += get_se_golomb(&s->gb);
4686                     my += get_se_golomb(&s->gb);
4687                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4688
4689                     val= pack16to32(mx,my);
4690                 }else
4691                     val=0;
4692                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4693             }
4694         }
4695         else if(IS_16X8(mb_type)){
4696             for(list=0; list<h->list_count; list++){
4697                     for(i=0; i<2; i++){
4698                         unsigned int val;
4699                         if(IS_DIR(mb_type, i, list)){
4700                             if(h->ref_count[list] == 1){
4701                                 val= 0;
4702                             }else if(h->ref_count[list] == 2){
4703                                 val= get_bits1(&s->gb)^1;
4704                             }else{
4705                                 val= get_ue_golomb_31(&s->gb);
4706                                 if(val >= h->ref_count[list]){
4707                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4708                                     return -1;
4709                                 }
4710                             }
4711                         }else
4712                             val= LIST_NOT_USED&0xFF;
4713                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4714                     }
4715             }
4716             for(list=0; list<h->list_count; list++){
4717                 for(i=0; i<2; i++){
4718                     unsigned int val;
4719                     if(IS_DIR(mb_type, i, list)){
4720                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4721                         mx += get_se_golomb(&s->gb);
4722                         my += get_se_golomb(&s->gb);
4723                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4724
4725                         val= pack16to32(mx,my);
4726                     }else
4727                         val=0;
4728                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4729                 }
4730             }
4731         }else{
4732             assert(IS_8X16(mb_type));
4733             for(list=0; list<h->list_count; list++){
4734                     for(i=0; i<2; i++){
4735                         unsigned int val;
4736                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4737                             if(h->ref_count[list]==1){
4738                                 val= 0;
4739                             }else if(h->ref_count[list]==2){
4740                                 val= get_bits1(&s->gb)^1;
4741                             }else{
4742                                 val= get_ue_golomb_31(&s->gb);
4743                                 if(val >= h->ref_count[list]){
4744                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4745                                     return -1;
4746                                 }
4747                             }
4748                         }else
4749                             val= LIST_NOT_USED&0xFF;
4750                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4751                     }
4752             }
4753             for(list=0; list<h->list_count; list++){
4754                 for(i=0; i<2; i++){
4755                     unsigned int val;
4756                     if(IS_DIR(mb_type, i, list)){
4757                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4758                         mx += get_se_golomb(&s->gb);
4759                         my += get_se_golomb(&s->gb);
4760                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4761
4762                         val= pack16to32(mx,my);
4763                     }else
4764                         val=0;
4765                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4766                 }
4767             }
4768         }
4769     }
4770
4771     if(IS_INTER(mb_type))
4772         write_back_motion(h, mb_type);
4773
          // intra 16x16 macroblocks already got their cbp from i_mb_type_info
          // above; every other type reads a golomb code that is mapped to a cbp
4774     if(!IS_INTRA16x16(mb_type)){
4775         cbp= get_ue_golomb(&s->gb);
4776         if(cbp > 47){
4777             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4778             return -1;
4779         }
4780
4781         if(CHROMA){
4782             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4783             else                     cbp= golomb_to_inter_cbp   [cbp];
4784         }else{
4785             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4786             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4787         }
4788     }
4789     h->cbp = cbp;
4790
4791     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4792         if(get_bits1(&s->gb)){
4793             mb_type |= MB_TYPE_8x8DCT;
4794             h->cbp_table[mb_xy]= cbp;
4795         }
4796     }
4797     s->current_picture.mb_type[mb_xy]= mb_type;
4798
4799     if(cbp || IS_INTRA16x16(mb_type)){
4800         int i8x8, i4x4, chroma_idx;
4801         int dquant;
4802         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4803         const uint8_t *scan, *scan8x8, *dc_scan;
4804
4805 //        fill_non_zero_count_cache(h);
4806
              // NOTE: the scan tables are chosen from s->qscale before dquant is applied below
4807         if(IS_INTERLACED(mb_type)){
4808             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4809             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4810             dc_scan= luma_dc_field_scan;
4811         }else{
4812             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4813             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4814             dc_scan= luma_dc_zigzag_scan;
4815         }
4816
4817         dquant= get_se_golomb(&s->gb);
4818
4819         if( dquant > 25 || dquant < -26 ){
4820             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4821             return -1;
4822         }
4823
4824         s->qscale += dquant;
4825         if(((unsigned)s->qscale) > 51){ // wrap qscale back into 0..51
4826             if(s->qscale<0) s->qscale+= 52;
4827             else            s->qscale-= 52;
4828         }
4829
4830         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4831         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4832         if(IS_INTRA16x16(mb_type)){
                  // the luma DC block is always decoded, the 15 coefficient AC blocks only if (cbp&15)
4833             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4834                 return -1; //FIXME continue if partitioned and other return -1 too
4835             }
4836
4837             assert((cbp&15) == 0 || (cbp&15) == 15);
4838
4839             if(cbp&15){
4840                 for(i8x8=0; i8x8<4; i8x8++){
4841                     for(i4x4=0; i4x4<4; i4x4++){
4842                         const int index= i4x4 + 4*i8x8;
4843                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4844                             return -1;
4845                         }
4846                     }
4847                 }
4848             }else{
4849                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4850             }
4851         }else{
                  // bit i8x8 of cbp tells whether luma 8x8 block i8x8 has coded coefficients
4852             for(i8x8=0; i8x8<4; i8x8++){
4853                 if(cbp & (1<<i8x8)){
4854                     if(IS_8x8DCT(mb_type)){
4855                         DCTELEM *buf = &h->mb[64*i8x8];
4856                         uint8_t *nnz;
4857                         for(i4x4=0; i4x4<4; i4x4++){
4858                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4859                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4860                                 return -1;
4861                         }
                              // fold the four 4x4 counts into the first entry of the 8x8 block
4862                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4863                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4864                     }else{
4865                         for(i4x4=0; i4x4<4; i4x4++){
4866                             const int index= i4x4 + 4*i8x8;
4867
4868                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4869                                 return -1;
4870                             }
4871                         }
4872                     }
4873                 }else{
4874                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4875                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4876                 }
4877             }
4878         }
4879
              // cbp&0x30: chroma DC blocks are coded, cbp&0x20: chroma AC blocks as well
4880         if(cbp&0x30){
4881             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4882                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4883                     return -1;
4884                 }
4885         }
4886
4887         if(cbp&0x20){
4888             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4889                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4890                 for(i4x4=0; i4x4<4; i4x4++){
4891                     const int index= 16 + 4*chroma_idx + i4x4;
4892                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4893                         return -1;
4894                     }
4895                 }
4896             }
4897         }else{
4898             uint8_t * const nnz= &h->non_zero_count_cache[0];
4899             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4900             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4901         }
4902     }else{
              // nothing coded: clear the luma and chroma non zero count cache entries
4903         uint8_t * const nnz= &h->non_zero_count_cache[0];
4904         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4905         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4906         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4907     }
4908     s->current_picture.qscale_table[mb_xy]= s->qscale;
4909     write_back_non_zero_count(h);
4910
          // undo the doubling of the reference counts done before fill_caches()
4911     if(MB_MBAFF){
4912         h->ref_count[0] >>= 1;
4913         h->ref_count[1] >>= 1;
4914     }
4915
4916     return 0;
4917 }
4918
4919 static int decode_cabac_field_decoding_flag(H264Context *h) {
4920     MpegEncContext * const s = &h->s;
4921     const int mb_x = s->mb_x;
4922     const int mb_y = s->mb_y & ~1;
4923     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4924     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4925
4926     unsigned int ctx = 0;
4927
4928     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4929         ctx += 1;
4930     }
4931     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4932         ctx += 1;
4933     }
4934
4935     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4936 }
4937
4938 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4939     uint8_t *state= &h->cabac_state[ctx_base];
4940     int mb_type;
4941
4942     if(intra_slice){
4943         MpegEncContext * const s = &h->s;
4944         const int mba_xy = h->left_mb_xy[0];
4945         const int mbb_xy = h->top_mb_xy;
4946         int ctx=0;
4947         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4948             ctx++;
4949         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4950             ctx++;
4951         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4952             return 0;   /* I4x4 */
4953         state += 2;
4954     }else{
4955         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4956             return 0;   /* I4x4 */
4957     }
4958
4959     if( get_cabac_terminate( &h->cabac ) )
4960         return 25;  /* PCM */
4961
4962     mb_type = 1; /* I16x16 */
4963     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4964     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4965         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4966     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4967     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4968     return mb_type;
4969 }
4970
4971 static int decode_cabac_mb_type_b( H264Context *h ) {
4972     MpegEncContext * const s = &h->s;
4973
4974         const int mba_xy = h->left_mb_xy[0];
4975         const int mbb_xy = h->top_mb_xy;
4976         int ctx = 0;
4977         int bits;
4978         assert(h->slice_type_nos == FF_B_TYPE);
4979
4980         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4981             ctx++;
4982         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4983             ctx++;
4984
4985         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4986             return 0; /* B_Direct_16x16 */
4987
4988         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4989             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4990         }
4991
4992         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4993         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4994         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4995         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4996         if( bits < 8 )
4997             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4998         else if( bits == 13 ) {
4999             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5000         } else if( bits == 14 )
5001             return 11; /* B_L1_L0_8x16 */
5002         else if( bits == 15 )
5003             return 22; /* B_8x8 */
5004
5005         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5006         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5007 }
5008
5009 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5010     MpegEncContext * const s = &h->s;
5011     int mba_xy, mbb_xy;
5012     int ctx = 0;
5013
5014     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5015         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5016         mba_xy = mb_xy - 1;
5017         if( (mb_y&1)
5018             && h->slice_table[mba_xy] == h->slice_num
5019             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5020             mba_xy += s->mb_stride;
5021         if( MB_FIELD ){
5022             mbb_xy = mb_xy - s->mb_stride;
5023             if( !(mb_y&1)
5024                 && h->slice_table[mbb_xy] == h->slice_num
5025                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5026                 mbb_xy -= s->mb_stride;
5027         }else
5028             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5029     }else{
5030         int mb_xy = h->mb_xy;
5031         mba_xy = mb_xy - 1;
5032         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5033     }
5034
5035     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5036         ctx++;
5037     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5038         ctx++;
5039
5040     if( h->slice_type_nos == FF_B_TYPE )
5041         ctx += 13;
5042     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5043 }
5044
5045 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5046     int mode = 0;
5047
5048     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5049         return pred_mode;
5050
5051     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5052     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5053     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5054
5055     if( mode >= pred_mode )
5056         return mode + 1;
5057     else
5058         return mode;
5059 }
5060
5061 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5062     const int mba_xy = h->left_mb_xy[0];
5063     const int mbb_xy = h->top_mb_xy;
5064
5065     int ctx = 0;
5066
5067     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5068     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5069         ctx++;
5070
5071     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5072         ctx++;
5073
5074     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5075         return 0;
5076
5077     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5078         return 1;
5079     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5080         return 2;
5081     else
5082         return 3;
5083 }
5084
5085 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5086     int cbp_b, cbp_a, ctx, cbp = 0;
5087
5088     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5089     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5090
5091     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5092     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5093     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5094     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5095     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5096     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5097     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5098     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5099     return cbp;
5100 }
5101 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5102     int ctx;
5103     int cbp_a, cbp_b;
5104
5105     cbp_a = (h->left_cbp>>4)&0x03;
5106     cbp_b = (h-> top_cbp>>4)&0x03;
5107
5108     ctx = 0;
5109     if( cbp_a > 0 ) ctx++;
5110     if( cbp_b > 0 ) ctx += 2;
5111     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5112         return 0;
5113
5114     ctx = 4;
5115     if( cbp_a == 2 ) ctx++;
5116     if( cbp_b == 2 ) ctx += 2;
5117     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5118 }
5119 static int decode_cabac_mb_dqp( H264Context *h) {
5120     int   ctx= h->last_qscale_diff != 0;
5121     int   val = 0;
5122
5123     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5124         ctx= 2+(ctx>>1);
5125         val++;
5126         if(val > 102) //prevent infinite loop
5127             return INT_MIN;
5128     }
5129
5130     if( val&0x01 )
5131         return   (val + 1)>>1 ;
5132     else
5133         return -((val + 1)>>1);
5134 }
5135 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5136     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5137         return 0;   /* 8x8 */
5138     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5139         return 1;   /* 8x4 */
5140     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5141         return 2;   /* 4x8 */
5142     return 3;       /* 4x4 */
5143 }
5144 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5145     int type;
5146     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5147         return 0;   /* B_Direct_8x8 */
5148     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5149         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5150     type = 3;
5151     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5152         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5153             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5154         type += 4;
5155     }
5156     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5157     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5158     return type;
5159 }
5160
5161 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5162     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5163 }
5164
5165 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5166     int refa = h->ref_cache[list][scan8[n] - 1];
5167     int refb = h->ref_cache[list][scan8[n] - 8];
5168     int ref  = 0;
5169     int ctx  = 0;
5170
5171     if( h->slice_type_nos == FF_B_TYPE) {
5172         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5173             ctx++;
5174         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5175             ctx += 2;
5176     } else {
5177         if( refa > 0 )
5178             ctx++;
5179         if( refb > 0 )
5180             ctx += 2;
5181     }
5182
5183     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5184         ref++;
5185         ctx = (ctx>>2)+4;
5186         if(ref >= 32 /*h->ref_list[list]*/){
5187             return -1;
5188         }
5189     }
5190     return ref;
5191 }
5192
5193 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5194     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5195                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5196     int ctxbase = (l == 0) ? 40 : 47;
5197     int mvd;
5198     int ctx = (amvd>2) + (amvd>32);
5199
5200     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5201         return 0;
5202
5203     mvd= 1;
5204     ctx= 3;
5205     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5206         mvd++;
5207         if( ctx < 6 )
5208             ctx++;
5209     }
5210
5211     if( mvd >= 9 ) {
5212         int k = 3;
5213         while( get_cabac_bypass( &h->cabac ) ) {
5214             mvd += 1 << k;
5215             k++;
5216             if(k>24){
5217                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5218                 return INT_MIN;
5219             }
5220         }
5221         while( k-- ) {
5222             if( get_cabac_bypass( &h->cabac ) )
5223                 mvd += 1 << k;
5224         }
5225     }
5226     return get_cabac_bypass_sign( &h->cabac, -mvd );
5227 }
5228
5229 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5230     int nza, nzb;
5231     int ctx = 0;
5232
5233     if( is_dc ) {
5234         if( cat == 0 ) {
5235             nza = h->left_cbp&0x100;
5236             nzb = h-> top_cbp&0x100;
5237         } else {
5238             nza = (h->left_cbp>>(6+idx))&0x01;
5239             nzb = (h-> top_cbp>>(6+idx))&0x01;
5240         }
5241     } else {
5242         assert(cat == 1 || cat == 2 || cat == 4);
5243         nza = h->non_zero_count_cache[scan8[idx] - 1];
5244         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5245     }
5246
5247     if( nza > 0 )
5248         ctx++;
5249
5250     if( nzb > 0 )
5251         ctx += 2;
5252
5253     return ctx + 4 * cat;
5254 }
5255
5256 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5257     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5258     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5259     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5260     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5261 };
5262
5263 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5264     static const int significant_coeff_flag_offset[2][6] = {
5265       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5266       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5267     };
5268     static const int last_coeff_flag_offset[2][6] = {
5269       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5270       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5271     };
5272     static const int coeff_abs_level_m1_offset[6] = {
5273         227+0, 227+10, 227+20, 227+30, 227+39, 426
5274     };
5275     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5276       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5277         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5278         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5279        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5280       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5281         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5282         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5283         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5284     };
5285     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5286      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5287      * map node ctx => cabac ctx for level=1 */
5288     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5289     /* map node ctx => cabac ctx for level>1 */
5290     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5291     static const uint8_t coeff_abs_level_transition[2][8] = {
5292     /* update node ctx after decoding a level=1 */
5293         { 1, 2, 3, 3, 4, 5, 6, 7 },
5294     /* update node ctx after decoding a level>1 */
5295         { 4, 4, 4, 4, 5, 6, 7, 7 }
5296     };
5297
5298     int index[64];
5299
5300     int av_unused last;
5301     int coeff_count = 0;
5302     int node_ctx = 0;
5303
5304     uint8_t *significant_coeff_ctx_base;
5305     uint8_t *last_coeff_ctx_base;
5306     uint8_t *abs_level_m1_ctx_base;
5307
5308 #if !ARCH_X86
5309 #define CABAC_ON_STACK
5310 #endif
5311 #ifdef CABAC_ON_STACK
5312 #define CC &cc
5313     CABACContext cc;
5314     cc.range     = h->cabac.range;
5315     cc.low       = h->cabac.low;
5316     cc.bytestream= h->cabac.bytestream;
5317 #else
5318 #define CC &h->cabac
5319 #endif
5320
5321
5322     /* cat: 0-> DC 16x16  n = 0
5323      *      1-> AC 16x16  n = luma4x4idx
5324      *      2-> Luma4x4   n = luma4x4idx
5325      *      3-> DC Chroma n = iCbCr
5326      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5327      *      5-> Luma8x8   n = 4 * luma8x8idx
5328      */
5329
5330     /* read coded block flag */
5331     if( is_dc || cat != 5 ) {
5332         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5333             if( !is_dc )
5334                 h->non_zero_count_cache[scan8[n]] = 0;
5335
5336 #ifdef CABAC_ON_STACK
5337             h->cabac.range     = cc.range     ;
5338             h->cabac.low       = cc.low       ;
5339             h->cabac.bytestream= cc.bytestream;
5340 #endif
5341             return;
5342         }
5343     }
5344
5345     significant_coeff_ctx_base = h->cabac_state
5346         + significant_coeff_flag_offset[MB_FIELD][cat];
5347     last_coeff_ctx_base = h->cabac_state
5348         + last_coeff_flag_offset[MB_FIELD][cat];
5349     abs_level_m1_ctx_base = h->cabac_state
5350         + coeff_abs_level_m1_offset[cat];
5351
5352     if( !is_dc && cat == 5 ) {
5353 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5354         for(last= 0; last < coefs; last++) { \
5355             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5356             if( get_cabac( CC, sig_ctx )) { \
5357                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5358                 index[coeff_count++] = last; \
5359                 if( get_cabac( CC, last_ctx ) ) { \
5360                     last= max_coeff; \
5361                     break; \
5362                 } \
5363             } \
5364         }\
5365         if( last == max_coeff -1 ) {\
5366             index[coeff_count++] = last;\
5367         }
5368         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5369 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5370         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5371     } else {
5372         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5373 #else
5374         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5375     } else {
5376         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5377 #endif
5378     }
5379     assert(coeff_count > 0);
5380
5381     if( is_dc ) {
5382         if( cat == 0 )
5383             h->cbp_table[h->mb_xy] |= 0x100;
5384         else
5385             h->cbp_table[h->mb_xy] |= 0x40 << n;
5386     } else {
5387         if( cat == 5 )
5388             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5389         else {
5390             assert( cat == 1 || cat == 2 || cat == 4 );
5391             h->non_zero_count_cache[scan8[n]] = coeff_count;
5392         }
5393     }
5394
5395     do {
5396         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5397
5398         int j= scantable[index[--coeff_count]];
5399
5400         if( get_cabac( CC, ctx ) == 0 ) {
5401             node_ctx = coeff_abs_level_transition[0][node_ctx];
5402             if( is_dc ) {
5403                 block[j] = get_cabac_bypass_sign( CC, -1);
5404             }else{
5405                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5406             }
5407         } else {
5408             int coeff_abs = 2;
5409             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5410             node_ctx = coeff_abs_level_transition[1][node_ctx];
5411
5412             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5413                 coeff_abs++;
5414             }
5415
5416             if( coeff_abs >= 15 ) {
5417                 int j = 0;
5418                 while( get_cabac_bypass( CC ) ) {
5419                     j++;
5420                 }
5421
5422                 coeff_abs=1;
5423                 while( j-- ) {
5424                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5425                 }
5426                 coeff_abs+= 14;
5427             }
5428
5429             if( is_dc ) {
5430                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5431             }else{
5432                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5433             }
5434         }
5435     } while( coeff_count );
5436 #ifdef CABAC_ON_STACK
5437             h->cabac.range     = cc.range     ;
5438             h->cabac.low       = cc.low       ;
5439             h->cabac.bytestream= cc.bytestream;
5440 #endif
5441
5442 }
5443
5444 #if !CONFIG_SMALL
5445 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5446     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5447 }
5448
5449 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5450     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5451 }
5452 #endif
5453
5454 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5455 #if CONFIG_SMALL
5456     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5457 #else
5458     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5459     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5460 #endif
5461 }
5462
5463 static inline void compute_mb_neighbors(H264Context *h)
5464 {
5465     MpegEncContext * const s = &h->s;
5466     const int mb_xy  = h->mb_xy;
5467     h->top_mb_xy     = mb_xy - s->mb_stride;
5468     h->left_mb_xy[0] = mb_xy - 1;
5469     if(FRAME_MBAFF){
5470         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5471         const int top_pair_xy      = pair_xy     - s->mb_stride;
5472         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5473         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5474         const int curr_mb_field_flag = MB_FIELD;
5475         const int bottom = (s->mb_y & 1);
5476
5477         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5478             h->top_mb_xy -= s->mb_stride;
5479         }
5480         if (!left_mb_field_flag == curr_mb_field_flag) {
5481             h->left_mb_xy[0] = pair_xy - 1;
5482         }
5483     } else if (FIELD_PICTURE) {
5484         h->top_mb_xy -= s->mb_stride;
5485     }
5486     return;
5487 }
5488
5489 /**
5490  * decodes a macroblock
5491  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5492  */
5493 static int decode_mb_cabac(H264Context *h) {
5494     MpegEncContext * const s = &h->s;
5495     int mb_xy;
5496     int mb_type, partition_count, cbp = 0;
5497     int dct8x8_allowed= h->pps.transform_8x8_mode;
5498
5499     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5500
5501     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5502     if( h->slice_type_nos != FF_I_TYPE ) {
5503         int skip;
5504         /* a skipped mb needs the aff flag from the following mb */
5505         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5506             predict_field_decoding_flag(h);
5507         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5508             skip = h->next_mb_skipped;
5509         else
5510             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5511         /* read skip flags */
5512         if( skip ) {
5513             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5514                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5515                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5516                 if(!h->next_mb_skipped)
5517                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5518             }
5519
5520             decode_mb_skip(h);
5521
5522             h->cbp_table[mb_xy] = 0;
5523             h->chroma_pred_mode_table[mb_xy] = 0;
5524             h->last_qscale_diff = 0;
5525
5526             return 0;
5527
5528         }
5529     }
5530     if(FRAME_MBAFF){
5531         if( (s->mb_y&1) == 0 )
5532             h->mb_mbaff =
5533             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5534     }
5535
5536     h->prev_mb_skipped = 0;
5537
5538     compute_mb_neighbors(h);
5539
5540     if( h->slice_type_nos == FF_B_TYPE ) {
5541         mb_type = decode_cabac_mb_type_b( h );
5542         if( mb_type < 23 ){
5543             partition_count= b_mb_type_info[mb_type].partition_count;
5544             mb_type=         b_mb_type_info[mb_type].type;
5545         }else{
5546             mb_type -= 23;
5547             goto decode_intra_mb;
5548         }
5549     } else if( h->slice_type_nos == FF_P_TYPE ) {
5550         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5551             /* P-type */
5552             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5553                 /* P_L0_D16x16, P_8x8 */
5554                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5555             } else {
5556                 /* P_L0_D8x16, P_L0_D16x8 */
5557                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5558             }
5559             partition_count= p_mb_type_info[mb_type].partition_count;
5560             mb_type=         p_mb_type_info[mb_type].type;
5561         } else {
5562             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5563             goto decode_intra_mb;
5564         }
5565     } else {
5566         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5567         if(h->slice_type == FF_SI_TYPE && mb_type)
5568             mb_type--;
5569         assert(h->slice_type_nos == FF_I_TYPE);
5570 decode_intra_mb:
5571         partition_count = 0;
5572         cbp= i_mb_type_info[mb_type].cbp;
5573         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5574         mb_type= i_mb_type_info[mb_type].type;
5575     }
5576     if(MB_FIELD)
5577         mb_type |= MB_TYPE_INTERLACED;
5578
5579     h->slice_table[ mb_xy ]= h->slice_num;
5580
5581     if(IS_INTRA_PCM(mb_type)) {
5582         const uint8_t *ptr;
5583
5584         // We assume these blocks are very rare so we do not optimize it.
5585         // FIXME The two following lines get the bitstream position in the cabac
5586         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5587         ptr= h->cabac.bytestream;
5588         if(h->cabac.low&0x1) ptr--;
5589         if(CABAC_BITS==16){
5590             if(h->cabac.low&0x1FF) ptr--;
5591         }
5592
5593         // The pixels are stored in the same order as levels in h->mb array.
5594         memcpy(h->mb, ptr, 256); ptr+=256;
5595         if(CHROMA){
5596             memcpy(h->mb+128, ptr, 128); ptr+=128;
5597         }
5598
5599         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5600
5601         // All blocks are present
5602         h->cbp_table[mb_xy] = 0x1ef;
5603         h->chroma_pred_mode_table[mb_xy] = 0;
5604         // In deblocking, the quantizer is 0
5605         s->current_picture.qscale_table[mb_xy]= 0;
5606         // All coeffs are present
5607         memset(h->non_zero_count[mb_xy], 16, 16);
5608         s->current_picture.mb_type[mb_xy]= mb_type;
5609         h->last_qscale_diff = 0;
5610         return 0;
5611     }
5612
5613     if(MB_MBAFF){
5614         h->ref_count[0] <<= 1;
5615         h->ref_count[1] <<= 1;
5616     }
5617
5618     fill_caches(h, mb_type, 0);
5619
5620     if( IS_INTRA( mb_type ) ) {
5621         int i, pred_mode;
5622         if( IS_INTRA4x4( mb_type ) ) {
5623             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5624                 mb_type |= MB_TYPE_8x8DCT;
5625                 for( i = 0; i < 16; i+=4 ) {
5626                     int pred = pred_intra_mode( h, i );
5627                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5628                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5629                 }
5630             } else {
5631                 for( i = 0; i < 16; i++ ) {
5632                     int pred = pred_intra_mode( h, i );
5633                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5634
5635                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5636                 }
5637             }
5638             write_back_intra_pred_mode(h);
5639             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5640         } else {
5641             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5642             if( h->intra16x16_pred_mode < 0 ) return -1;
5643         }
5644         if(CHROMA){
5645             h->chroma_pred_mode_table[mb_xy] =
5646             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5647
5648             pred_mode= check_intra_pred_mode( h, pred_mode );
5649             if( pred_mode < 0 ) return -1;
5650             h->chroma_pred_mode= pred_mode;
5651         }
5652     } else if( partition_count == 4 ) {
5653         int i, j, sub_partition_count[4], list, ref[2][4];
5654
5655         if( h->slice_type_nos == FF_B_TYPE ) {
5656             for( i = 0; i < 4; i++ ) {
5657                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5658                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5659                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5660             }
5661             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5662                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5663                 pred_direct_motion(h, &mb_type);
5664                 h->ref_cache[0][scan8[4]] =
5665                 h->ref_cache[1][scan8[4]] =
5666                 h->ref_cache[0][scan8[12]] =
5667                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5668                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5669                     for( i = 0; i < 4; i++ )
5670                         if( IS_DIRECT(h->sub_mb_type[i]) )
5671                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5672                 }
5673             }
5674         } else {
5675             for( i = 0; i < 4; i++ ) {
5676                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5677                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5678                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5679             }
5680         }
5681
5682         for( list = 0; list < h->list_count; list++ ) {
5683                 for( i = 0; i < 4; i++ ) {
5684                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5685                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5686                         if( h->ref_count[list] > 1 ){
5687                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5688                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5689                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5690                                 return -1;
5691                             }
5692                         }else
5693                             ref[list][i] = 0;
5694                     } else {
5695                         ref[list][i] = -1;
5696                     }
5697                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5698                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5699                 }
5700         }
5701
5702         if(dct8x8_allowed)
5703             dct8x8_allowed = get_dct8x8_allowed(h);
5704
5705         for(list=0; list<h->list_count; list++){
5706             for(i=0; i<4; i++){
5707                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5708                 if(IS_DIRECT(h->sub_mb_type[i])){
5709                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5710                     continue;
5711                 }
5712
5713                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5714                     const int sub_mb_type= h->sub_mb_type[i];
5715                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5716                     for(j=0; j<sub_partition_count[i]; j++){
5717                         int mpx, mpy;
5718                         int mx, my;
5719                         const int index= 4*i + block_width*j;
5720                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5721                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5722                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5723
5724                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5725                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5726                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5727
5728                         if(IS_SUB_8X8(sub_mb_type)){
5729                             mv_cache[ 1 ][0]=
5730                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5731                             mv_cache[ 1 ][1]=
5732                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5733
5734                             mvd_cache[ 1 ][0]=
5735                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5736                             mvd_cache[ 1 ][1]=
5737                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5738                         }else if(IS_SUB_8X4(sub_mb_type)){
5739                             mv_cache[ 1 ][0]= mx;
5740                             mv_cache[ 1 ][1]= my;
5741
5742                             mvd_cache[ 1 ][0]= mx - mpx;
5743                             mvd_cache[ 1 ][1]= my - mpy;
5744                         }else if(IS_SUB_4X8(sub_mb_type)){
5745                             mv_cache[ 8 ][0]= mx;
5746                             mv_cache[ 8 ][1]= my;
5747
5748                             mvd_cache[ 8 ][0]= mx - mpx;
5749                             mvd_cache[ 8 ][1]= my - mpy;
5750                         }
5751                         mv_cache[ 0 ][0]= mx;
5752                         mv_cache[ 0 ][1]= my;
5753
5754                         mvd_cache[ 0 ][0]= mx - mpx;
5755                         mvd_cache[ 0 ][1]= my - mpy;
5756                     }
5757                 }else{
5758                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5759                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5760                     p[0] = p[1] = p[8] = p[9] = 0;
5761                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5762                 }
5763             }
5764         }
5765     } else if( IS_DIRECT(mb_type) ) {
5766         pred_direct_motion(h, &mb_type);
5767         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5768         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5769         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5770     } else {
5771         int list, mx, my, i, mpx, mpy;
5772         if(IS_16X16(mb_type)){
5773             for(list=0; list<h->list_count; list++){
5774                 if(IS_DIR(mb_type, 0, list)){
5775                     int ref;
5776                     if(h->ref_count[list] > 1){
5777                         ref= decode_cabac_mb_ref(h, list, 0);
5778                         if(ref >= (unsigned)h->ref_count[list]){
5779                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5780                             return -1;
5781                         }
5782                     }else
5783                         ref=0;
5784                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5785                 }else
5786                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5787             }
5788             for(list=0; list<h->list_count; list++){
5789                 if(IS_DIR(mb_type, 0, list)){
5790                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5791
5792                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5793                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5794                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5795
5796                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5797                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5798                 }else
5799                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5800             }
5801         }
5802         else if(IS_16X8(mb_type)){
5803             for(list=0; list<h->list_count; list++){
5804                     for(i=0; i<2; i++){
5805                         if(IS_DIR(mb_type, i, list)){
5806                             int ref;
5807                             if(h->ref_count[list] > 1){
5808                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5809                                 if(ref >= (unsigned)h->ref_count[list]){
5810                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5811                                     return -1;
5812                                 }
5813                             }else
5814                                 ref=0;
5815                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5816                         }else
5817                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5818                     }
5819             }
5820             for(list=0; list<h->list_count; list++){
5821                 for(i=0; i<2; i++){
5822                     if(IS_DIR(mb_type, i, list)){
5823                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5824                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5825                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5826                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5827
5828                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5829                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5830                     }else{
5831                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5832                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5833                     }
5834                 }
5835             }
5836         }else{
5837             assert(IS_8X16(mb_type));
5838             for(list=0; list<h->list_count; list++){
5839                     for(i=0; i<2; i++){
5840                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5841                             int ref;
5842                             if(h->ref_count[list] > 1){
5843                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5844                                 if(ref >= (unsigned)h->ref_count[list]){
5845                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5846                                     return -1;
5847                                 }
5848                             }else
5849                                 ref=0;
5850                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5851                         }else
5852                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5853                     }
5854             }
5855             for(list=0; list<h->list_count; list++){
5856                 for(i=0; i<2; i++){
5857                     if(IS_DIR(mb_type, i, list)){
5858                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5859                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5860                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5861
5862                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5863                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5864                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5865                     }else{
5866                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5867                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5868                     }
5869                 }
5870             }
5871         }
5872     }
5873
5874    if( IS_INTER( mb_type ) ) {
5875         h->chroma_pred_mode_table[mb_xy] = 0;
5876         write_back_motion( h, mb_type );
5877    }
5878
5879     if( !IS_INTRA16x16( mb_type ) ) {
5880         cbp  = decode_cabac_mb_cbp_luma( h );
5881         if(CHROMA)
5882             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5883     }
5884
5885     h->cbp_table[mb_xy] = h->cbp = cbp;
5886
5887     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5888         if( decode_cabac_mb_transform_size( h ) )
5889             mb_type |= MB_TYPE_8x8DCT;
5890     }
5891     s->current_picture.mb_type[mb_xy]= mb_type;
5892
5893     if( cbp || IS_INTRA16x16( mb_type ) ) {
5894         const uint8_t *scan, *scan8x8, *dc_scan;
5895         const uint32_t *qmul;
5896         int dqp;
5897
5898         if(IS_INTERLACED(mb_type)){
5899             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5900             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5901             dc_scan= luma_dc_field_scan;
5902         }else{
5903             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5904             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5905             dc_scan= luma_dc_zigzag_scan;
5906         }
5907
5908         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5909         if( dqp == INT_MIN ){
5910             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5911             return -1;
5912         }
5913         s->qscale += dqp;
5914         if(((unsigned)s->qscale) > 51){
5915             if(s->qscale<0) s->qscale+= 52;
5916             else            s->qscale-= 52;
5917         }
5918         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5919         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5920
5921         if( IS_INTRA16x16( mb_type ) ) {
5922             int i;
5923             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5924             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5925
5926             if( cbp&15 ) {
5927                 qmul = h->dequant4_coeff[0][s->qscale];
5928                 for( i = 0; i < 16; i++ ) {
5929                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5930                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5931                 }
5932             } else {
5933                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5934             }
5935         } else {
5936             int i8x8, i4x4;
5937             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5938                 if( cbp & (1<<i8x8) ) {
5939                     if( IS_8x8DCT(mb_type) ) {
5940                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5941                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5942                     } else {
5943                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5944                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5945                             const int index = 4*i8x8 + i4x4;
5946                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5947 //START_TIMER
5948                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5949 //STOP_TIMER("decode_residual")
5950                         }
5951                     }
5952                 } else {
5953                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5954                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5955                 }
5956             }
5957         }
5958
5959         if( cbp&0x30 ){
5960             int c;
5961             for( c = 0; c < 2; c++ ) {
5962                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5963                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5964             }
5965         }
5966
5967         if( cbp&0x20 ) {
5968             int c, i;
5969             for( c = 0; c < 2; c++ ) {
5970                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5971                 for( i = 0; i < 4; i++ ) {
5972                     const int index = 16 + 4 * c + i;
5973                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5974                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5975                 }
5976             }
5977         } else {
5978             uint8_t * const nnz= &h->non_zero_count_cache[0];
5979             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5980             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5981         }
5982     } else {
5983         uint8_t * const nnz= &h->non_zero_count_cache[0];
5984         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5985         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5986         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5987         h->last_qscale_diff = 0;
5988     }
5989
5990     s->current_picture.qscale_table[mb_xy]= s->qscale;
5991     write_back_non_zero_count(h);
5992
5993     if(MB_MBAFF){
5994         h->ref_count[0] >>= 1;
5995         h->ref_count[1] >>= 1;
5996     }
5997
5998     return 0;
5999 }
6000
6001
6002 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6003     const int index_a = qp + h->slice_alpha_c0_offset;
6004     const int alpha = (alpha_table+52)[index_a];
6005     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6006
6007     if( bS[0] < 4 ) {
6008         int8_t tc[4];
6009         tc[0] = (tc0_table+52)[index_a][bS[0]];
6010         tc[1] = (tc0_table+52)[index_a][bS[1]];
6011         tc[2] = (tc0_table+52)[index_a][bS[2]];
6012         tc[3] = (tc0_table+52)[index_a][bS[3]];
6013         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6014     } else {
6015         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6016     }
6017 }
6018 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6019     const int index_a = qp + h->slice_alpha_c0_offset;
6020     const int alpha = (alpha_table+52)[index_a];
6021     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6022
6023     if( bS[0] < 4 ) {
6024         int8_t tc[4];
6025         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6026         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6027         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6028         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6029         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6030     } else {
6031         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6032     }
6033 }
6034
6035 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6036     int i;
6037     for( i = 0; i < 16; i++, pix += stride) {
6038         int index_a;
6039         int alpha;
6040         int beta;
6041
6042         int qp_index;
6043         int bS_index = (i >> 1);
6044         if (!MB_FIELD) {
6045             bS_index &= ~1;
6046             bS_index |= (i & 1);
6047         }
6048
6049         if( bS[bS_index] == 0 ) {
6050             continue;
6051         }
6052
6053         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6054         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6055         alpha = (alpha_table+52)[index_a];
6056         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6057
6058         if( bS[bS_index] < 4 ) {
6059             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6060             const int p0 = pix[-1];
6061             const int p1 = pix[-2];
6062             const int p2 = pix[-3];
6063             const int q0 = pix[0];
6064             const int q1 = pix[1];
6065             const int q2 = pix[2];
6066
6067             if( FFABS( p0 - q0 ) < alpha &&
6068                 FFABS( p1 - p0 ) < beta &&
6069                 FFABS( q1 - q0 ) < beta ) {
6070                 int tc = tc0;
6071                 int i_delta;
6072
6073                 if( FFABS( p2 - p0 ) < beta ) {
6074                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6075                     tc++;
6076                 }
6077                 if( FFABS( q2 - q0 ) < beta ) {
6078                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6079                     tc++;
6080                 }
6081
6082                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6083                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6084                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6085                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6086             }
6087         }else{
6088             const int p0 = pix[-1];
6089             const int p1 = pix[-2];
6090             const int p2 = pix[-3];
6091
6092             const int q0 = pix[0];
6093             const int q1 = pix[1];
6094             const int q2 = pix[2];
6095
6096             if( FFABS( p0 - q0 ) < alpha &&
6097                 FFABS( p1 - p0 ) < beta &&
6098                 FFABS( q1 - q0 ) < beta ) {
6099
6100                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6101                     if( FFABS( p2 - p0 ) < beta)
6102                     {
6103                         const int p3 = pix[-4];
6104                         /* p0', p1', p2' */
6105                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6106                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6107                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6108                     } else {
6109                         /* p0' */
6110                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6111                     }
6112                     if( FFABS( q2 - q0 ) < beta)
6113                     {
6114                         const int q3 = pix[3];
6115                         /* q0', q1', q2' */
6116                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6117                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6118                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6119                     } else {
6120                         /* q0' */
6121                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6122                     }
6123                 }else{
6124                     /* p0', q0' */
6125                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6126                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6127                 }
6128                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6129             }
6130         }
6131     }
6132 }
6133 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6134     int i;
6135     for( i = 0; i < 8; i++, pix += stride) {
6136         int index_a;
6137         int alpha;
6138         int beta;
6139
6140         int qp_index;
6141         int bS_index = i;
6142
6143         if( bS[bS_index] == 0 ) {
6144             continue;
6145         }
6146
6147         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6148         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6149         alpha = (alpha_table+52)[index_a];
6150         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6151
6152         if( bS[bS_index] < 4 ) {
6153             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6154             const int p0 = pix[-1];
6155             const int p1 = pix[-2];
6156             const int q0 = pix[0];
6157             const int q1 = pix[1];
6158
6159             if( FFABS( p0 - q0 ) < alpha &&
6160                 FFABS( p1 - p0 ) < beta &&
6161                 FFABS( q1 - q0 ) < beta ) {
6162                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6163
6164                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6165                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6166                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6167             }
6168         }else{
6169             const int p0 = pix[-1];
6170             const int p1 = pix[-2];
6171             const int q0 = pix[0];
6172             const int q1 = pix[1];
6173
6174             if( FFABS( p0 - q0 ) < alpha &&
6175                 FFABS( p1 - p0 ) < beta &&
6176                 FFABS( q1 - q0 ) < beta ) {
6177
6178                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6179                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6180                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6181             }
6182         }
6183     }
6184 }
6185
6186 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6187     const int index_a = qp + h->slice_alpha_c0_offset;
6188     const int alpha = (alpha_table+52)[index_a];
6189     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6190
6191     if( bS[0] < 4 ) {
6192         int8_t tc[4];
6193         tc[0] = (tc0_table+52)[index_a][bS[0]];
6194         tc[1] = (tc0_table+52)[index_a][bS[1]];
6195         tc[2] = (tc0_table+52)[index_a][bS[2]];
6196         tc[3] = (tc0_table+52)[index_a][bS[3]];
6197         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6198     } else {
6199         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6200     }
6201 }
6202
6203 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6204     const int index_a = qp + h->slice_alpha_c0_offset;
6205     const int alpha = (alpha_table+52)[index_a];
6206     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6207
6208     if( bS[0] < 4 ) {
6209         int8_t tc[4];
6210         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6211         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6212         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6213         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6214         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6215     } else {
6216         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6217     }
6218 }
6219
6220 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6221     MpegEncContext * const s = &h->s;
6222     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6223     int mb_xy, mb_type;
6224     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6225
6226     mb_xy = h->mb_xy;
6227
6228     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6229         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6230        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6231                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6232         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6233         return;
6234     }
6235     assert(!FRAME_MBAFF);
6236
6237     mb_type = s->current_picture.mb_type[mb_xy];
6238     qp = s->current_picture.qscale_table[mb_xy];
6239     qp0 = s->current_picture.qscale_table[mb_xy-1];
6240     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6241     qpc = get_chroma_qp( h, 0, qp );
6242     qpc0 = get_chroma_qp( h, 0, qp0 );
6243     qpc1 = get_chroma_qp( h, 0, qp1 );
6244     qp0 = (qp + qp0 + 1) >> 1;
6245     qp1 = (qp + qp1 + 1) >> 1;
6246     qpc0 = (qpc + qpc0 + 1) >> 1;
6247     qpc1 = (qpc + qpc1 + 1) >> 1;
6248     qp_thresh = 15 - h->slice_alpha_c0_offset;
6249     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6250        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6251         return;
6252
6253     if( IS_INTRA(mb_type) ) {
6254         int16_t bS4[4] = {4,4,4,4};
6255         int16_t bS3[4] = {3,3,3,3};
6256         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6257         if( IS_8x8DCT(mb_type) ) {
6258             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6259             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6260             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6261             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6262         } else {
6263             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6264             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6265             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6266             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6267             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6268             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6269             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6270             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6271         }
6272         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6273         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6274         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6275         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6276         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6277         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6278         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6279         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6280         return;
6281     } else {
6282         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6283         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6284         int edges;
6285         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6286             edges = 4;
6287             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6288         } else {
6289             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6290                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6291             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6292                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6293                              ? 3 : 0;
6294             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6295             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6296             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6297                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6298         }
6299         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6300             bSv[0][0] = 0x0004000400040004ULL;
6301         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6302             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6303
6304 #define FILTER(hv,dir,edge)\
6305         if(bSv[dir][edge]) {\
6306             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6307             if(!(edge&1)) {\
6308                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6309                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6310             }\
6311         }
6312         if( edges == 1 ) {
6313             FILTER(v,0,0);
6314             FILTER(h,1,0);
6315         } else if( IS_8x8DCT(mb_type) ) {
6316             FILTER(v,0,0);
6317             FILTER(v,0,2);
6318             FILTER(h,1,0);
6319             FILTER(h,1,2);
6320         } else {
6321             FILTER(v,0,0);
6322             FILTER(v,0,1);
6323             FILTER(v,0,2);
6324             FILTER(v,0,3);
6325             FILTER(h,1,0);
6326             FILTER(h,1,1);
6327             FILTER(h,1,2);
6328             FILTER(h,1,3);
6329         }
6330 #undef FILTER
6331     }
6332 }
6333
6334
6335 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6336     MpegEncContext * const s = &h->s;
6337     int edge;
6338     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6339     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6340     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6341     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6342     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6343
6344     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6345                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6346     // how often to recheck mv-based bS when iterating between edges
6347     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6348                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6349     // how often to recheck mv-based bS when iterating along each edge
6350     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6351
6352     if (first_vertical_edge_done) {
6353         start = 1;
6354     }
6355
6356     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6357         start = 1;
6358
6359     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6360         && !IS_INTERLACED(mb_type)
6361         && IS_INTERLACED(mbm_type)
6362         ) {
6363         // This is a special case in the norm where the filtering must
6364         // be done twice (one each of the field) even if we are in a
6365         // frame macroblock.
6366         //
6367         static const int nnz_idx[4] = {4,5,6,3};
6368         unsigned int tmp_linesize   = 2 *   linesize;
6369         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6370         int mbn_xy = mb_xy - 2 * s->mb_stride;
6371         int qp;
6372         int i, j;
6373         int16_t bS[4];
6374
6375         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6376             if( IS_INTRA(mb_type) ||
6377                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6378                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6379             } else {
6380                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6381                 for( i = 0; i < 4; i++ ) {
6382                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6383                         mbn_nnz[nnz_idx[i]] != 0 )
6384                         bS[i] = 2;
6385                     else
6386                         bS[i] = 1;
6387                 }
6388             }
6389             // Do not use s->qscale as luma quantizer because it has not the same
6390             // value in IPCM macroblocks.
6391             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6392             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6393             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6394             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6395             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6396                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6397             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6398                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6399         }
6400
6401         start = 1;
6402     }
6403
6404     /* Calculate bS */
6405     for( edge = start; edge < edges; edge++ ) {
6406         /* mbn_xy: neighbor macroblock */
6407         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6408         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6409         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6410         int16_t bS[4];
6411         int qp;
6412
6413         if( (edge&1) && IS_8x8DCT(mb_type) )
6414             continue;
6415
6416         if( IS_INTRA(mb_type) ||
6417             IS_INTRA(mbn_type) ) {
6418             int value;
6419             if (edge == 0) {
6420                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6421                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6422                 ) {
6423                     value = 4;
6424                 } else {
6425                     value = 3;
6426                 }
6427             } else {
6428                 value = 3;
6429             }
6430             bS[0] = bS[1] = bS[2] = bS[3] = value;
6431         } else {
6432             int i, l;
6433             int mv_done;
6434
6435             if( edge & mask_edge ) {
6436                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6437                 mv_done = 1;
6438             }
6439             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6440                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6441                 mv_done = 1;
6442             }
6443             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6444                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6445                 int bn_idx= b_idx - (dir ? 8:1);
6446                 int v = 0;
6447
6448                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6449                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6450                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6451                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6452                 }
6453
6454                 if(h->slice_type_nos == FF_B_TYPE && v){
6455                     v=0;
6456                     for( l = 0; !v && l < 2; l++ ) {
6457                         int ln= 1-l;
6458                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6459                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6460                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6461                     }
6462                 }
6463
6464                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6465                 mv_done = 1;
6466             }
6467             else
6468                 mv_done = 0;
6469
6470             for( i = 0; i < 4; i++ ) {
6471                 int x = dir == 0 ? edge : i;
6472                 int y = dir == 0 ? i    : edge;
6473                 int b_idx= 8 + 4 + x + 8*y;
6474                 int bn_idx= b_idx - (dir ? 8:1);
6475
6476                 if( h->non_zero_count_cache[b_idx] |
6477                     h->non_zero_count_cache[bn_idx] ) {
6478                     bS[i] = 2;
6479                 }
6480                 else if(!mv_done)
6481                 {
6482                     bS[i] = 0;
6483                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6484                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6485                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6486                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6487                             bS[i] = 1;
6488                             break;
6489                         }
6490                     }
6491
6492                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6493                         bS[i] = 0;
6494                         for( l = 0; l < 2; l++ ) {
6495                             int ln= 1-l;
6496                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6497                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6498                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6499                                 bS[i] = 1;
6500                                 break;
6501                             }
6502                         }
6503                     }
6504                 }
6505             }
6506
6507             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6508                 continue;
6509         }
6510
6511         /* Filter edge */
6512         // Do not use s->qscale as luma quantizer because it has not the same
6513         // value in IPCM macroblocks.
6514         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6515         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6516         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6517         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6518         if( dir == 0 ) {
6519             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6520             if( (edge&1) == 0 ) {
6521                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6522                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6523                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6524                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6525             }
6526         } else {
6527             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6528             if( (edge&1) == 0 ) {
6529                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6530                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6531                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6532                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6533             }
6534         }
6535     }
6536 }
6537
6538 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6539     MpegEncContext * const s = &h->s;
6540     const int mb_xy= mb_x + mb_y*s->mb_stride;
6541     const int mb_type = s->current_picture.mb_type[mb_xy];
6542     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6543     int first_vertical_edge_done = 0;
6544     av_unused int dir;
6545
6546     //for sufficiently low qp, filtering wouldn't do anything
6547     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6548     if(!FRAME_MBAFF){
6549         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6550         int qp = s->current_picture.qscale_table[mb_xy];
6551         if(qp <= qp_thresh
6552            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6553            && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6554             return;
6555         }
6556     }
6557
6558     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6559     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6560         int top_type, left_type[2];
6561         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6562         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6563         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6564
6565         if(IS_8x8DCT(top_type)){
6566             h->non_zero_count_cache[4+8*0]=
6567             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6568             h->non_zero_count_cache[6+8*0]=
6569             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6570         }
6571         if(IS_8x8DCT(left_type[0])){
6572             h->non_zero_count_cache[3+8*1]=
6573             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6574         }
6575         if(IS_8x8DCT(left_type[1])){
6576             h->non_zero_count_cache[3+8*3]=
6577             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6578         }
6579
6580         if(IS_8x8DCT(mb_type)){
6581             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6582             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6583
6584             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6585             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6586
6587             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6588             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6589
6590             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6591             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6592         }
6593     }
6594
6595     if (FRAME_MBAFF
6596             // left mb is in picture
6597             && h->slice_table[mb_xy-1] != 0xFFFF
6598             // and current and left pair do not have the same interlaced type
6599             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6600             // and left mb is in the same slice if deblocking_filter == 2
6601             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6602         /* First vertical edge is different in MBAFF frames
6603          * There are 8 different bS to compute and 2 different Qp
6604          */
6605         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6606         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6607         int16_t bS[8];
6608         int qp[2];
6609         int bqp[2];
6610         int rqp[2];
6611         int mb_qp, mbn0_qp, mbn1_qp;
6612         int i;
6613         first_vertical_edge_done = 1;
6614
6615         if( IS_INTRA(mb_type) )
6616             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6617         else {
6618             for( i = 0; i < 8; i++ ) {
6619                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6620
6621                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6622                     bS[i] = 4;
6623                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6624                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6625                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6626                                                                        :
6627                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6628                     bS[i] = 2;
6629                 else
6630                     bS[i] = 1;
6631             }
6632         }
6633
6634         mb_qp = s->current_picture.qscale_table[mb_xy];
6635         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6636         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6637         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6638         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6639                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6640         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6641                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6642         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6643         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6644                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6645         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6646                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6647
6648         /* Filter edge */
6649         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6650         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6651         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6652         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6653         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6654     }
6655
6656 #if CONFIG_SMALL
6657     for( dir = 0; dir < 2; dir++ )
6658         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6659 #else
6660     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6661     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6662 #endif
6663 }
6664
6665 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6666     H264Context *h = *(void**)arg;
6667     MpegEncContext * const s = &h->s;
6668     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6669
6670     s->mb_skip_run= -1;
6671
6672     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6673                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6674
6675     if( h->pps.cabac ) {
6676         int i;
6677
6678         /* realign */
6679         align_get_bits( &s->gb );
6680
6681         /* init cabac */
6682         ff_init_cabac_states( &h->cabac);
6683         ff_init_cabac_decoder( &h->cabac,
6684                                s->gb.buffer + get_bits_count(&s->gb)/8,
6685                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6686         /* calculate pre-state */
6687         for( i= 0; i < 460; i++ ) {
6688             int pre;
6689             if( h->slice_type_nos == FF_I_TYPE )
6690                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6691             else
6692                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6693
6694             if( pre <= 63 )
6695                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6696             else
6697                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6698         }
6699
6700         for(;;){
6701 //START_TIMER
6702             int ret = decode_mb_cabac(h);
6703             int eos;
6704 //STOP_TIMER("decode_mb_cabac")
6705
6706             if(ret>=0) hl_decode_mb(h);
6707
6708             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6709                 s->mb_y++;
6710
6711                 ret = decode_mb_cabac(h);
6712
6713                 if(ret>=0) hl_decode_mb(h);
6714                 s->mb_y--;
6715             }
6716             eos = get_cabac_terminate( &h->cabac );
6717
6718             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6719                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6720                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6721                 return -1;
6722             }
6723
6724             if( ++s->mb_x >= s->mb_width ) {
6725                 s->mb_x = 0;
6726                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6727                 ++s->mb_y;
6728                 if(FIELD_OR_MBAFF_PICTURE) {
6729                     ++s->mb_y;
6730                 }
6731             }
6732
6733             if( eos || s->mb_y >= s->mb_height ) {
6734                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6735                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6736                 return 0;
6737             }
6738         }
6739
6740     } else {
6741         for(;;){
6742             int ret = decode_mb_cavlc(h);
6743
6744             if(ret>=0) hl_decode_mb(h);
6745
6746             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6747                 s->mb_y++;
6748                 ret = decode_mb_cavlc(h);
6749
6750                 if(ret>=0) hl_decode_mb(h);
6751                 s->mb_y--;
6752             }
6753
6754             if(ret<0){
6755                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6756                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6757
6758                 return -1;
6759             }
6760
6761             if(++s->mb_x >= s->mb_width){
6762                 s->mb_x=0;
6763                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6764                 ++s->mb_y;
6765                 if(FIELD_OR_MBAFF_PICTURE) {
6766                     ++s->mb_y;
6767                 }
6768                 if(s->mb_y >= s->mb_height){
6769                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6770
6771                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6772                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6773
6774                         return 0;
6775                     }else{
6776                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6777
6778                         return -1;
6779                     }
6780                 }
6781             }
6782
6783             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6784                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6785                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6786                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6787
6788                     return 0;
6789                 }else{
6790                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6791
6792                     return -1;
6793                 }
6794             }
6795         }
6796     }
6797
6798 #if 0
6799     for(;s->mb_y < s->mb_height; s->mb_y++){
6800         for(;s->mb_x < s->mb_width; s->mb_x++){
6801             int ret= decode_mb(h);
6802
6803             hl_decode_mb(h);
6804
6805             if(ret<0){
6806                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6807                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6808
6809                 return -1;
6810             }
6811
6812             if(++s->mb_x >= s->mb_width){
6813                 s->mb_x=0;
6814                 if(++s->mb_y >= s->mb_height){
6815                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6816                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6817
6818                         return 0;
6819                     }else{
6820                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6821
6822                         return -1;
6823                     }
6824                 }
6825             }
6826
6827             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6828                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6829                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6830
6831                     return 0;
6832                 }else{
6833                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6834
6835                     return -1;
6836                 }
6837             }
6838         }
6839         s->mb_x=0;
6840         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6841     }
6842 #endif
6843     return -1; //not reached
6844 }
6845
6846 static int decode_picture_timing(H264Context *h){
6847     MpegEncContext * const s = &h->s;
6848     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6849         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6850         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6851     }
6852     if(h->sps.pic_struct_present_flag){
6853         unsigned int i, num_clock_ts;
6854         h->sei_pic_struct = get_bits(&s->gb, 4);
6855         h->sei_ct_type    = 0;
6856
6857         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6858             return -1;
6859
6860         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6861
6862         for (i = 0 ; i < num_clock_ts ; i++){
6863             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6864                 unsigned int full_timestamp_flag;
6865                 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6866                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6867                 skip_bits(&s->gb, 5);                 /* counting_type */
6868                 full_timestamp_flag = get_bits(&s->gb, 1);
6869                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6870                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6871                 skip_bits(&s->gb, 8);                 /* n_frames */
6872                 if(full_timestamp_flag){
6873                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6874                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6875                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6876                 }else{
6877                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6878                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6879                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6880                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6881                             if(get_bits(&s->gb, 1))   /* hours_flag */
6882                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6883                         }
6884                     }
6885                 }
6886                 if(h->sps.time_offset_length > 0)
6887                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6888             }
6889         }
6890     }
6891     return 0;
6892 }
6893
6894 static int decode_unregistered_user_data(H264Context *h, int size){
6895     MpegEncContext * const s = &h->s;
6896     uint8_t user_data[16+256];
6897     int e, build, i;
6898
6899     if(size<16)
6900         return -1;
6901
6902     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6903         user_data[i]= get_bits(&s->gb, 8);
6904     }
6905
6906     user_data[i]= 0;
6907     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6908     if(e==1 && build>=0)
6909         h->x264_build= build;
6910
6911     if(s->avctx->debug & FF_DEBUG_BUGS)
6912         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6913
6914     for(; i<size; i++)
6915         skip_bits(&s->gb, 8);
6916
6917     return 0;
6918 }
6919
6920 static int decode_recovery_point(H264Context *h){
6921     MpegEncContext * const s = &h->s;
6922
6923     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6924     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6925
6926     return 0;
6927 }
6928
6929 static int decode_buffering_period(H264Context *h){
6930     MpegEncContext * const s = &h->s;
6931     unsigned int sps_id;
6932     int sched_sel_idx;
6933     SPS *sps;
6934
6935     sps_id = get_ue_golomb_31(&s->gb);
6936     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6937         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6938         return -1;
6939     }
6940     sps = h->sps_buffers[sps_id];
6941
6942     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6943     if (sps->nal_hrd_parameters_present_flag) {
6944         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6945             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6946             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6947         }
6948     }
6949     if (sps->vcl_hrd_parameters_present_flag) {
6950         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6951             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6952             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6953         }
6954     }
6955
6956     h->sei_buffering_period_present = 1;
6957     return 0;
6958 }
6959
6960 int ff_h264_decode_sei(H264Context *h){
6961     MpegEncContext * const s = &h->s;
6962
6963     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6964         int size, type;
6965
6966         type=0;
6967         do{
6968             type+= show_bits(&s->gb, 8);
6969         }while(get_bits(&s->gb, 8) == 255);
6970
6971         size=0;
6972         do{
6973             size+= show_bits(&s->gb, 8);
6974         }while(get_bits(&s->gb, 8) == 255);
6975
6976         switch(type){
6977         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6978             if(decode_picture_timing(h) < 0)
6979                 return -1;
6980             break;
6981         case SEI_TYPE_USER_DATA_UNREGISTERED:
6982             if(decode_unregistered_user_data(h, size) < 0)
6983                 return -1;
6984             break;
6985         case SEI_TYPE_RECOVERY_POINT:
6986             if(decode_recovery_point(h) < 0)
6987                 return -1;
6988             break;
6989         case SEI_BUFFERING_PERIOD:
6990             if(decode_buffering_period(h) < 0)
6991                 return -1;
6992             break;
6993         default:
6994             skip_bits(&s->gb, 8*size);
6995         }
6996
6997         //FIXME check bits here
6998         align_get_bits(&s->gb);
6999     }
7000
7001     return 0;
7002 }
7003
7004 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
7005     MpegEncContext * const s = &h->s;
7006     int cpb_count, i;
7007     cpb_count = get_ue_golomb_31(&s->gb) + 1;
7008
7009     if(cpb_count > 32U){
7010         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7011         return -1;
7012     }
7013
7014     get_bits(&s->gb, 4); /* bit_rate_scale */
7015     get_bits(&s->gb, 4); /* cpb_size_scale */
7016     for(i=0; i<cpb_count; i++){
7017         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7018         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7019         get_bits1(&s->gb);     /* cbr_flag */
7020     }
7021     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7022     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7023     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7024     sps->time_offset_length = get_bits(&s->gb, 5);
7025     sps->cpb_cnt = cpb_count;
7026     return 0;
7027 }
7028
7029 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7030     MpegEncContext * const s = &h->s;
7031     int aspect_ratio_info_present_flag;
7032     unsigned int aspect_ratio_idc;
7033
7034     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7035
7036     if( aspect_ratio_info_present_flag ) {
7037         aspect_ratio_idc= get_bits(&s->gb, 8);
7038         if( aspect_ratio_idc == EXTENDED_SAR ) {
7039             sps->sar.num= get_bits(&s->gb, 16);
7040             sps->sar.den= get_bits(&s->gb, 16);
7041         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7042             sps->sar=  pixel_aspect[aspect_ratio_idc];
7043         }else{
7044             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7045             return -1;
7046         }
7047     }else{
7048         sps->sar.num=
7049         sps->sar.den= 0;
7050     }
7051 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7052
7053     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7054         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7055     }
7056
7057     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7058         get_bits(&s->gb, 3);    /* video_format */
7059         get_bits1(&s->gb);      /* video_full_range_flag */
7060         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7061             get_bits(&s->gb, 8); /* colour_primaries */
7062             get_bits(&s->gb, 8); /* transfer_characteristics */
7063             get_bits(&s->gb, 8); /* matrix_coefficients */
7064         }
7065     }
7066
7067     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7068         s->avctx->chroma_sample_location = get_ue_golomb(&s->gb)+1;  /* chroma_sample_location_type_top_field */
7069         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7070     }
7071
7072     sps->timing_info_present_flag = get_bits1(&s->gb);
7073     if(sps->timing_info_present_flag){
7074         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7075         sps->time_scale = get_bits_long(&s->gb, 32);
7076         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7077     }
7078
7079     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7080     if(sps->nal_hrd_parameters_present_flag)
7081         if(decode_hrd_parameters(h, sps) < 0)
7082             return -1;
7083     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7084     if(sps->vcl_hrd_parameters_present_flag)
7085         if(decode_hrd_parameters(h, sps) < 0)
7086             return -1;
7087     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7088         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7089     sps->pic_struct_present_flag = get_bits1(&s->gb);
7090
7091     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7092     if(sps->bitstream_restriction_flag){
7093         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7094         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7095         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7096         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7097         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7098         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7099         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7100
7101         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7102             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7103             return -1;
7104         }
7105     }
7106
7107     return 0;
7108 }
7109
7110 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7111                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7112     MpegEncContext * const s = &h->s;
7113     int i, last = 8, next = 8;
7114     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7115     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7116         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7117     else
7118     for(i=0;i<size;i++){
7119         if(next)
7120             next = (last + get_se_golomb(&s->gb)) & 0xff;
7121         if(!i && !next){ /* matrix not written, we use the preset one */
7122             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7123             break;
7124         }
7125         last = factors[scan[i]] = next ? next : last;
7126     }
7127 }
7128
7129 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7130                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7131     MpegEncContext * const s = &h->s;
7132     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7133     const uint8_t *fallback[4] = {
7134         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7135         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7136         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7137         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7138     };
7139     if(get_bits1(&s->gb)){
7140         sps->scaling_matrix_present |= is_sps;
7141         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7142         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7143         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7144         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7145         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7146         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7147         if(is_sps || pps->transform_8x8_mode){
7148             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7149             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7150         }
7151     }
7152 }
7153
7154 int ff_h264_decode_seq_parameter_set(H264Context *h){
7155     MpegEncContext * const s = &h->s;
7156     int profile_idc, level_idc;
7157     unsigned int sps_id;
7158     int i;
7159     SPS *sps;
7160
7161     profile_idc= get_bits(&s->gb, 8);
7162     get_bits1(&s->gb);   //constraint_set0_flag
7163     get_bits1(&s->gb);   //constraint_set1_flag
7164     get_bits1(&s->gb);   //constraint_set2_flag
7165     get_bits1(&s->gb);   //constraint_set3_flag
7166     get_bits(&s->gb, 4); // reserved
7167     level_idc= get_bits(&s->gb, 8);
7168     sps_id= get_ue_golomb_31(&s->gb);
7169
7170     if(sps_id >= MAX_SPS_COUNT) {
7171         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7172         return -1;
7173     }
7174     sps= av_mallocz(sizeof(SPS));
7175     if(sps == NULL)
7176         return -1;
7177
7178     sps->profile_idc= profile_idc;
7179     sps->level_idc= level_idc;
7180
7181     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7182     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7183     sps->scaling_matrix_present = 0;
7184
7185     if(sps->profile_idc >= 100){ //high profile
7186         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7187         if(sps->chroma_format_idc == 3)
7188             sps->residual_color_transform_flag = get_bits1(&s->gb);
7189         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7190         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7191         sps->transform_bypass = get_bits1(&s->gb);
7192         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7193     }else{
7194         sps->chroma_format_idc= 1;
7195     }
7196
7197     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7198     sps->poc_type= get_ue_golomb_31(&s->gb);
7199
7200     if(sps->poc_type == 0){ //FIXME #define
7201         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7202     } else if(sps->poc_type == 1){//FIXME #define
7203         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7204         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7205         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7206         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7207
7208         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7209             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7210             goto fail;
7211         }
7212
7213         for(i=0; i<sps->poc_cycle_length; i++)
7214             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7215     }else if(sps->poc_type != 2){
7216         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7217         goto fail;
7218     }
7219
7220     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7221     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7222         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7223         goto fail;
7224     }
7225     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7226     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7227     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7228     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7229        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7230         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7231         goto fail;
7232     }
7233
7234     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7235     if(!sps->frame_mbs_only_flag)
7236         sps->mb_aff= get_bits1(&s->gb);
7237     else
7238         sps->mb_aff= 0;
7239
7240     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7241
7242 #ifndef ALLOW_INTERLACE
7243     if(sps->mb_aff)
7244         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7245 #endif
7246     sps->crop= get_bits1(&s->gb);
7247     if(sps->crop){
7248         sps->crop_left  = get_ue_golomb(&s->gb);
7249         sps->crop_right = get_ue_golomb(&s->gb);
7250         sps->crop_top   = get_ue_golomb(&s->gb);
7251         sps->crop_bottom= get_ue_golomb(&s->gb);
7252         if(sps->crop_left || sps->crop_top){
7253             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7254         }
7255         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7256             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7257         }
7258     }else{
7259         sps->crop_left  =
7260         sps->crop_right =
7261         sps->crop_top   =
7262         sps->crop_bottom= 0;
7263     }
7264
7265     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7266     if( sps->vui_parameters_present_flag )
7267         decode_vui_parameters(h, sps);
7268
7269     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7270         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7271                sps_id, sps->profile_idc, sps->level_idc,
7272                sps->poc_type,
7273                sps->ref_frame_count,
7274                sps->mb_width, sps->mb_height,
7275                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7276                sps->direct_8x8_inference_flag ? "8B8" : "",
7277                sps->crop_left, sps->crop_right,
7278                sps->crop_top, sps->crop_bottom,
7279                sps->vui_parameters_present_flag ? "VUI" : "",
7280                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7281                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7282                sps->timing_info_present_flag ? sps->time_scale : 0
7283                );
7284     }
7285
7286     av_free(h->sps_buffers[sps_id]);
7287     h->sps_buffers[sps_id]= sps;
7288     h->sps = *sps;
7289     return 0;
7290 fail:
7291     av_free(sps);
7292     return -1;
7293 }
7294
7295 static void
7296 build_qp_table(PPS *pps, int t, int index)
7297 {
7298     int i;
7299     for(i = 0; i < 52; i++)
7300         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7301 }
7302
7303 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7304     MpegEncContext * const s = &h->s;
7305     unsigned int pps_id= get_ue_golomb(&s->gb);
7306     PPS *pps;
7307
7308     if(pps_id >= MAX_PPS_COUNT) {
7309         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7310         return -1;
7311     }
7312
7313     pps= av_mallocz(sizeof(PPS));
7314     if(pps == NULL)
7315         return -1;
7316     pps->sps_id= get_ue_golomb_31(&s->gb);
7317     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7318         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7319         goto fail;
7320     }
7321
7322     pps->cabac= get_bits1(&s->gb);
7323     pps->pic_order_present= get_bits1(&s->gb);
7324     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7325     if(pps->slice_group_count > 1 ){
7326         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7327         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7328         switch(pps->mb_slice_group_map_type){
7329         case 0:
7330 #if 0
7331 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7332 |    run_length[ i ]                                |1  |ue(v)   |
7333 #endif
7334             break;
7335         case 2:
7336 #if 0
7337 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7338 |{                                                  |   |        |
7339 |    top_left_mb[ i ]                               |1  |ue(v)   |
7340 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7341 |   }                                               |   |        |
7342 #endif
7343             break;
7344         case 3:
7345         case 4:
7346         case 5:
7347 #if 0
7348 |   slice_group_change_direction_flag               |1  |u(1)    |
7349 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7350 #endif
7351             break;
7352         case 6:
7353 #if 0
7354 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7355 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7356 |)                                                  |   |        |
7357 |    slice_group_id[ i ]                            |1  |u(v)    |
7358 #endif
7359             break;
7360         }
7361     }
7362     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7363     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7364     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7365         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7366         goto fail;
7367     }
7368
7369     pps->weighted_pred= get_bits1(&s->gb);
7370     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7371     pps->init_qp= get_se_golomb(&s->gb) + 26;
7372     pps->init_qs= get_se_golomb(&s->gb) + 26;
7373     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7374     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7375     pps->constrained_intra_pred= get_bits1(&s->gb);
7376     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7377
7378     pps->transform_8x8_mode= 0;
7379     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7380     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7381     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7382
7383     if(get_bits_count(&s->gb) < bit_length){
7384         pps->transform_8x8_mode= get_bits1(&s->gb);
7385         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7386         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7387     } else {
7388         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7389     }
7390
7391     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7392     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7393     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7394         h->pps.chroma_qp_diff= 1;
7395
7396     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7397         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7398                pps_id, pps->sps_id,
7399                pps->cabac ? "CABAC" : "CAVLC",
7400                pps->slice_group_count,
7401                pps->ref_count[0], pps->ref_count[1],
7402                pps->weighted_pred ? "weighted" : "",
7403                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7404                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7405                pps->constrained_intra_pred ? "CONSTR" : "",
7406                pps->redundant_pic_cnt_present ? "REDU" : "",
7407                pps->transform_8x8_mode ? "8x8DCT" : ""
7408                );
7409     }
7410
7411     av_free(h->pps_buffers[pps_id]);
7412     h->pps_buffers[pps_id]= pps;
7413     return 0;
7414 fail:
7415     av_free(pps);
7416     return -1;
7417 }
7418
7419 /**
7420  * Call decode_slice() for each context.
7421  *
7422  * @param h h264 master context
7423  * @param context_count number of contexts to execute
7424  */
7425 static void execute_decode_slices(H264Context *h, int context_count){
7426     MpegEncContext * const s = &h->s;
7427     AVCodecContext * const avctx= s->avctx;
7428     H264Context *hx;
7429     int i;
7430
7431     if (s->avctx->hwaccel)
7432         return;
7433     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7434         return;
7435     if(context_count == 1) {
7436         decode_slice(avctx, &h);
7437     } else {
7438         for(i = 1; i < context_count; i++) {
7439             hx = h->thread_context[i];
7440             hx->s.error_recognition = avctx->error_recognition;
7441             hx->s.error_count = 0;
7442         }
7443
7444         avctx->execute(avctx, (void *)decode_slice,
7445                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7446
7447         /* pull back stuff from slices to master context */
7448         hx = h->thread_context[context_count - 1];
7449         s->mb_x = hx->s.mb_x;
7450         s->mb_y = hx->s.mb_y;
7451         s->dropable = hx->s.dropable;
7452         s->picture_structure = hx->s.picture_structure;
7453         for(i = 1; i < context_count; i++)
7454             h->s.error_count += h->thread_context[i]->s.error_count;
7455     }
7456 }
7457
7458
7459 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7460     MpegEncContext * const s = &h->s;
7461     AVCodecContext * const avctx= s->avctx;
7462     int buf_index=0;
7463     H264Context *hx; ///< thread context
7464     int context_count = 0;
7465     int next_avc= h->is_avc ? 0 : buf_size;
7466
7467     h->max_contexts = avctx->thread_count;
7468 #if 0
7469     int i;
7470     for(i=0; i<50; i++){
7471         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7472     }
7473 #endif
7474     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7475         h->current_slice = 0;
7476         if (!s->first_field)
7477             s->current_picture_ptr= NULL;
7478         reset_sei(h);
7479     }
7480
7481     for(;;){
7482         int consumed;
7483         int dst_length;
7484         int bit_length;
7485         const uint8_t *ptr;
7486         int i, nalsize = 0;
7487         int err;
7488
7489         if(buf_index >= next_avc) {
7490             if(buf_index >= buf_size) break;
7491             nalsize = 0;
7492             for(i = 0; i < h->nal_length_size; i++)
7493                 nalsize = (nalsize << 8) | buf[buf_index++];
7494             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7495                 if(nalsize == 1){
7496                     buf_index++;
7497                     continue;
7498                 }else{
7499                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7500                     break;
7501                 }
7502             }
7503             next_avc= buf_index + nalsize;
7504         } else {
7505             // start code prefix search
7506             for(; buf_index + 3 < buf_size; buf_index++){
7507                 // This should always succeed in the first iteration.
7508                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7509                     break;
7510             }
7511
7512             if(buf_index+3 >= buf_size) break;
7513
7514             buf_index+=3;
7515         }
7516
7517         hx = h->thread_context[context_count];
7518
7519         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, next_avc - buf_index);
7520         if (ptr==NULL || dst_length < 0){
7521             return -1;
7522         }
7523         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7524             dst_length--;
7525         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7526
7527         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7528             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7529         }
7530
7531         if (h->is_avc && (nalsize != consumed) && nalsize){
7532             int i, debug_level = AV_LOG_DEBUG;
7533             for (i = consumed; i < nalsize; i++)
7534                 if (buf[buf_index+i])
7535                     debug_level = AV_LOG_ERROR;
7536             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7537         }
7538
7539         buf_index += consumed;
7540
7541         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7542            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7543             continue;
7544
7545       again:
7546         err = 0;
7547         switch(hx->nal_unit_type){
7548         case NAL_IDR_SLICE:
7549             if (h->nal_unit_type != NAL_IDR_SLICE) {
7550                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7551                 return -1;
7552             }
7553             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7554         case NAL_SLICE:
7555             init_get_bits(&hx->s.gb, ptr, bit_length);
7556             hx->intra_gb_ptr=
7557             hx->inter_gb_ptr= &hx->s.gb;
7558             hx->s.data_partitioning = 0;
7559
7560             if((err = decode_slice_header(hx, h)))
7561                break;
7562
7563             if (s->avctx->hwaccel && h->current_slice == 1) {
7564                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7565                     return -1;
7566             }
7567
7568             s->current_picture_ptr->key_frame |=
7569                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7570                     (h->sei_recovery_frame_cnt >= 0);
7571             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7572                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7573                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7574                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7575                && avctx->skip_frame < AVDISCARD_ALL){
7576                 if(avctx->hwaccel) {
7577                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7578                         return -1;
7579                 }else
7580                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7581                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7582                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7583                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7584                 }else
7585                     context_count++;
7586             }
7587             break;
7588         case NAL_DPA:
7589             init_get_bits(&hx->s.gb, ptr, bit_length);
7590             hx->intra_gb_ptr=
7591             hx->inter_gb_ptr= NULL;
7592             hx->s.data_partitioning = 1;
7593
7594             err = decode_slice_header(hx, h);
7595             break;
7596         case NAL_DPB:
7597             init_get_bits(&hx->intra_gb, ptr, bit_length);
7598             hx->intra_gb_ptr= &hx->intra_gb;
7599             break;
7600         case NAL_DPC:
7601             init_get_bits(&hx->inter_gb, ptr, bit_length);
7602             hx->inter_gb_ptr= &hx->inter_gb;
7603
7604             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7605                && s->context_initialized
7606                && s->hurry_up < 5
7607                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7608                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7609                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7610                && avctx->skip_frame < AVDISCARD_ALL)
7611                 context_count++;
7612             break;
7613         case NAL_SEI:
7614             init_get_bits(&s->gb, ptr, bit_length);
7615             ff_h264_decode_sei(h);
7616             break;
7617         case NAL_SPS:
7618             init_get_bits(&s->gb, ptr, bit_length);
7619             ff_h264_decode_seq_parameter_set(h);
7620
7621             if(s->flags& CODEC_FLAG_LOW_DELAY)
7622                 s->low_delay=1;
7623
7624             if(avctx->has_b_frames < 2)
7625                 avctx->has_b_frames= !s->low_delay;
7626             break;
7627         case NAL_PPS:
7628             init_get_bits(&s->gb, ptr, bit_length);
7629
7630             ff_h264_decode_picture_parameter_set(h, bit_length);
7631
7632             break;
7633         case NAL_AUD:
7634         case NAL_END_SEQUENCE:
7635         case NAL_END_STREAM:
7636         case NAL_FILLER_DATA:
7637         case NAL_SPS_EXT:
7638         case NAL_AUXILIARY_SLICE:
7639             break;
7640         default:
7641             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7642         }
7643
7644         if(context_count == h->max_contexts) {
7645             execute_decode_slices(h, context_count);
7646             context_count = 0;
7647         }
7648
7649         if (err < 0)
7650             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7651         else if(err == 1) {
7652             /* Slice could not be decoded in parallel mode, copy down
7653              * NAL unit stuff to context 0 and restart. Note that
7654              * rbsp_buffer is not transferred, but since we no longer
7655              * run in parallel mode this should not be an issue. */
7656             h->nal_unit_type = hx->nal_unit_type;
7657             h->nal_ref_idc   = hx->nal_ref_idc;
7658             hx = h;
7659             goto again;
7660         }
7661     }
7662     if(context_count)
7663         execute_decode_slices(h, context_count);
7664     return buf_index;
7665 }
7666
7667 /**
7668  * returns the number of bytes consumed for building the current frame
7669  */
7670 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7671         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7672         if(pos+10>buf_size) pos=buf_size; // oops ;)
7673
7674         return pos;
7675 }
7676
7677 static int decode_frame(AVCodecContext *avctx,
7678                              void *data, int *data_size,
7679                              AVPacket *avpkt)
7680 {
7681     const uint8_t *buf = avpkt->data;
7682     int buf_size = avpkt->size;
7683     H264Context *h = avctx->priv_data;
7684     MpegEncContext *s = &h->s;
7685     AVFrame *pict = data;
7686     int buf_index;
7687
7688     s->flags= avctx->flags;
7689     s->flags2= avctx->flags2;
7690
7691    /* end of stream, output what is still in the buffers */
7692     if (buf_size == 0) {
7693         Picture *out;
7694         int i, out_idx;
7695
7696 //FIXME factorize this with the output code below
7697         out = h->delayed_pic[0];
7698         out_idx = 0;
7699         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7700             if(h->delayed_pic[i]->poc < out->poc){
7701                 out = h->delayed_pic[i];
7702                 out_idx = i;
7703             }
7704
7705         for(i=out_idx; h->delayed_pic[i]; i++)
7706             h->delayed_pic[i] = h->delayed_pic[i+1];
7707
7708         if(out){
7709             *data_size = sizeof(AVFrame);
7710             *pict= *(AVFrame*)out;
7711         }
7712
7713         return 0;
7714     }
7715
7716     if(h->is_avc && !h->got_avcC) {
7717         int i, cnt, nalsize;
7718         unsigned char *p = avctx->extradata;
7719         if(avctx->extradata_size < 7) {
7720             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7721             return -1;
7722         }
7723         if(*p != 1) {
7724             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7725             return -1;
7726         }
7727         /* sps and pps in the avcC always have length coded with 2 bytes,
7728            so put a fake nal_length_size = 2 while parsing them */
7729         h->nal_length_size = 2;
7730         // Decode sps from avcC
7731         cnt = *(p+5) & 0x1f; // Number of sps
7732         p += 6;
7733         for (i = 0; i < cnt; i++) {
7734             nalsize = AV_RB16(p) + 2;
7735             if(decode_nal_units(h, p, nalsize) < 0) {
7736                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7737                 return -1;
7738             }
7739             p += nalsize;
7740         }
7741         // Decode pps from avcC
7742         cnt = *(p++); // Number of pps
7743         for (i = 0; i < cnt; i++) {
7744             nalsize = AV_RB16(p) + 2;
7745             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7746                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7747                 return -1;
7748             }
7749             p += nalsize;
7750         }
7751         // Now store right nal length size, that will be use to parse all other nals
7752         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7753         // Do not reparse avcC
7754         h->got_avcC = 1;
7755     }
7756
7757     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7758         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7759             return -1;
7760         h->got_avcC = 1;
7761     }
7762
7763     buf_index=decode_nal_units(h, buf, buf_size);
7764     if(buf_index < 0)
7765         return -1;
7766
7767     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7768         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7769         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7770         return -1;
7771     }
7772
7773     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7774         Picture *out = s->current_picture_ptr;
7775         Picture *cur = s->current_picture_ptr;
7776         int i, pics, cross_idr, out_of_order, out_idx;
7777
7778         field_end(h);
7779
7780         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7781             /* Wait for second field. */
7782             *data_size = 0;
7783
7784         } else {
7785             cur->repeat_pict = 0;
7786
7787             /* Signal interlacing information externally. */
7788             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7789             if (h->sei_ct_type)
7790                 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7791             else
7792                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7793
7794             if(h->sps.pic_struct_present_flag){
7795                 switch (h->sei_pic_struct)
7796                 {
7797                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7798                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7799                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7800                     // From these hints, let the applications decide if they apply deinterlacing.
7801                     cur->repeat_pict = 1;
7802                     break;
7803                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7804                     // Force progressive here, as doubling interlaced frame is a bad idea.
7805                     cur->interlaced_frame = 0;
7806                     cur->repeat_pict = 2;
7807                     break;
7808                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7809                     cur->interlaced_frame = 0;
7810                     cur->repeat_pict = 4;
7811                     break;
7812                 }
7813             }else{
7814                 /* Derive interlacing flag from used decoding process. */
7815                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7816             }
7817
7818             if (cur->field_poc[0] != cur->field_poc[1]){
7819                 /* Derive top_field_first from field pocs. */
7820                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7821             }else{
7822                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7823                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7824                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7825                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7826                         cur->top_field_first = 1;
7827                     else
7828                         cur->top_field_first = 0;
7829                 }else{
7830                     /* Most likely progressive */
7831                     cur->top_field_first = 0;
7832                 }
7833             }
7834
7835         //FIXME do something with unavailable reference frames
7836
7837             /* Sort B-frames into display order */
7838
7839             if(h->sps.bitstream_restriction_flag
7840                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7841                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7842                 s->low_delay = 0;
7843             }
7844
7845             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7846                && !h->sps.bitstream_restriction_flag){
7847                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7848                 s->low_delay= 0;
7849             }
7850
7851             pics = 0;
7852             while(h->delayed_pic[pics]) pics++;
7853
7854             assert(pics <= MAX_DELAYED_PIC_COUNT);
7855
7856             h->delayed_pic[pics++] = cur;
7857             if(cur->reference == 0)
7858                 cur->reference = DELAYED_PIC_REF;
7859
7860             out = h->delayed_pic[0];
7861             out_idx = 0;
7862             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7863                 if(h->delayed_pic[i]->poc < out->poc){
7864                     out = h->delayed_pic[i];
7865                     out_idx = i;
7866                 }
7867             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7868
7869             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7870
7871             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7872                 { }
7873             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7874                || (s->low_delay &&
7875                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7876                  || cur->pict_type == FF_B_TYPE)))
7877             {
7878                 s->low_delay = 0;
7879                 s->avctx->has_b_frames++;
7880             }
7881
7882             if(out_of_order || pics > s->avctx->has_b_frames){
7883                 out->reference &= ~DELAYED_PIC_REF;
7884                 for(i=out_idx; h->delayed_pic[i]; i++)
7885                     h->delayed_pic[i] = h->delayed_pic[i+1];
7886             }
7887             if(!out_of_order && pics > s->avctx->has_b_frames){
7888                 *data_size = sizeof(AVFrame);
7889
7890                 h->outputed_poc = out->poc;
7891                 *pict= *(AVFrame*)out;
7892             }else{
7893                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7894             }
7895         }
7896     }
7897
7898     assert(pict->data[0] || !*data_size);
7899     ff_print_debug_info(s, pict);
7900 //printf("out %d\n", (int)pict->data[0]);
7901 #if 0 //?
7902
7903     /* Return the Picture timestamp as the frame number */
7904     /* we subtract 1 because it is added on utils.c     */
7905     avctx->frame_number = s->picture_number - 1;
7906 #endif
7907     return get_consumed_bytes(s, buf_index, buf_size);
7908 }
7909 #if 0
7910 static inline void fill_mb_avail(H264Context *h){
7911     MpegEncContext * const s = &h->s;
7912     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7913
7914     if(s->mb_y){
7915         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7916         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7917         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7918     }else{
7919         h->mb_avail[0]=
7920         h->mb_avail[1]=
7921         h->mb_avail[2]= 0;
7922     }
7923     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7924     h->mb_avail[4]= 1; //FIXME move out
7925     h->mb_avail[5]= 0; //FIXME move out
7926 }
7927 #endif
7928
7929 #ifdef TEST
7930 #undef printf
7931 #undef random
7932 #define COUNT 8000
7933 #define SIZE (COUNT*40)
7934 int main(void){
7935     int i;
7936     uint8_t temp[SIZE];
7937     PutBitContext pb;
7938     GetBitContext gb;
7939 //    int int_temp[10000];
7940     DSPContext dsp;
7941     AVCodecContext avctx;
7942
7943     dsputil_init(&dsp, &avctx);
7944
7945     init_put_bits(&pb, temp, SIZE);
7946     printf("testing unsigned exp golomb\n");
7947     for(i=0; i<COUNT; i++){
7948         START_TIMER
7949         set_ue_golomb(&pb, i);
7950         STOP_TIMER("set_ue_golomb");
7951     }
7952     flush_put_bits(&pb);
7953
7954     init_get_bits(&gb, temp, 8*SIZE);
7955     for(i=0; i<COUNT; i++){
7956         int j, s;
7957
7958         s= show_bits(&gb, 24);
7959
7960         START_TIMER
7961         j= get_ue_golomb(&gb);
7962         if(j != i){
7963             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7964 //            return -1;
7965         }
7966         STOP_TIMER("get_ue_golomb");
7967     }
7968
7969
7970     init_put_bits(&pb, temp, SIZE);
7971     printf("testing signed exp golomb\n");
7972     for(i=0; i<COUNT; i++){
7973         START_TIMER
7974         set_se_golomb(&pb, i - COUNT/2);
7975         STOP_TIMER("set_se_golomb");
7976     }
7977     flush_put_bits(&pb);
7978
7979     init_get_bits(&gb, temp, 8*SIZE);
7980     for(i=0; i<COUNT; i++){
7981         int j, s;
7982
7983         s= show_bits(&gb, 24);
7984
7985         START_TIMER
7986         j= get_se_golomb(&gb);
7987         if(j != i - COUNT/2){
7988             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7989 //            return -1;
7990         }
7991         STOP_TIMER("get_se_golomb");
7992     }
7993
7994 #if 0
7995     printf("testing 4x4 (I)DCT\n");
7996
7997     DCTELEM block[16];
7998     uint8_t src[16], ref[16];
7999     uint64_t error= 0, max_error=0;
8000
8001     for(i=0; i<COUNT; i++){
8002         int j;
8003 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
8004         for(j=0; j<16; j++){
8005             ref[j]= random()%255;
8006             src[j]= random()%255;
8007         }
8008
8009         h264_diff_dct_c(block, src, ref, 4);
8010
8011         //normalize
8012         for(j=0; j<16; j++){
8013 //            printf("%d ", block[j]);
8014             block[j]= block[j]*4;
8015             if(j&1) block[j]= (block[j]*4 + 2)/5;
8016             if(j&4) block[j]= (block[j]*4 + 2)/5;
8017         }
8018 //        printf("\n");
8019
8020         s->dsp.h264_idct_add(ref, block, 4);
8021 /*        for(j=0; j<16; j++){
8022             printf("%d ", ref[j]);
8023         }
8024         printf("\n");*/
8025
8026         for(j=0; j<16; j++){
8027             int diff= FFABS(src[j] - ref[j]);
8028
8029             error+= diff*diff;
8030             max_error= FFMAX(max_error, diff);
8031         }
8032     }
8033     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
8034     printf("testing quantizer\n");
8035     for(qp=0; qp<52; qp++){
8036         for(i=0; i<16; i++)
8037             src1_block[i]= src2_block[i]= random()%255;
8038
8039     }
8040     printf("Testing NAL layer\n");
8041
8042     uint8_t bitstream[COUNT];
8043     uint8_t nal[COUNT*2];
8044     H264Context h;
8045     memset(&h, 0, sizeof(H264Context));
8046
8047     for(i=0; i<COUNT; i++){
8048         int zeros= i;
8049         int nal_length;
8050         int consumed;
8051         int out_length;
8052         uint8_t *out;
8053         int j;
8054
8055         for(j=0; j<COUNT; j++){
8056             bitstream[j]= (random() % 255) + 1;
8057         }
8058
8059         for(j=0; j<zeros; j++){
8060             int pos= random() % COUNT;
8061             while(bitstream[pos] == 0){
8062                 pos++;
8063                 pos %= COUNT;
8064             }
8065             bitstream[pos]=0;
8066         }
8067
8068         START_TIMER
8069
8070         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8071         if(nal_length<0){
8072             printf("encoding failed\n");
8073             return -1;
8074         }
8075
8076         out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);
8077
8078         STOP_TIMER("NAL")
8079
8080         if(out_length != COUNT){
8081             printf("incorrect length %d %d\n", out_length, COUNT);
8082             return -1;
8083         }
8084
8085         if(consumed != nal_length){
8086             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8087             return -1;
8088         }
8089
8090         if(memcmp(bitstream, out, COUNT)){
8091             printf("mismatch\n");
8092             return -1;
8093         }
8094     }
8095 #endif
8096
8097     printf("Testing RBSP\n");
8098
8099
8100     return 0;
8101 }
8102 #endif /* TEST */
8103
8104
8105 av_cold void ff_h264_free_context(H264Context *h)
8106 {
8107     int i;
8108
8109     av_freep(&h->rbsp_buffer[0]);
8110     av_freep(&h->rbsp_buffer[1]);
8111     free_tables(h); //FIXME cleanup init stuff perhaps
8112
8113     for(i = 0; i < MAX_SPS_COUNT; i++)
8114         av_freep(h->sps_buffers + i);
8115
8116     for(i = 0; i < MAX_PPS_COUNT; i++)
8117         av_freep(h->pps_buffers + i);
8118 }
8119
8120 static av_cold int decode_end(AVCodecContext *avctx)
8121 {
8122     H264Context *h = avctx->priv_data;
8123     MpegEncContext *s = &h->s;
8124
8125     ff_h264_free_context(h);
8126
8127     MPV_common_end(s);
8128
8129 //    memset(h, 0, sizeof(H264Context));
8130
8131     return 0;
8132 }
8133
8134
8135 AVCodec h264_decoder = {
8136     "h264",
8137     CODEC_TYPE_VIDEO,
8138     CODEC_ID_H264,
8139     sizeof(H264Context),
8140     decode_init,
8141     NULL,
8142     decode_end,
8143     decode_frame,
8144     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8145     .flush= flush_dpb,
8146     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8147     .pix_fmts= ff_hwaccel_pixfmt_list_420,
8148 };
8149
8150 #if CONFIG_H264_VDPAU_DECODER
8151 AVCodec h264_vdpau_decoder = {
8152     "h264_vdpau",
8153     CODEC_TYPE_VIDEO,
8154     CODEC_ID_H264,
8155     sizeof(H264Context),
8156     decode_init,
8157     NULL,
8158     decode_end,
8159     decode_frame,
8160     CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
8161     .flush= flush_dpb,
8162     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
8163 };
8164 #endif
8165
8166 #if CONFIG_SVQ3_DECODER
8167 #include "svq3.c"
8168 #endif