libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "internal.h"
  29 #include "dsputil.h"
  30 #include "avcodec.h"
  31 #include "mpegvideo.h"
  32 #include "h264.h"
  33 #include "h264data.h"
  34 #include "h264_parser.h"
  35 #include "golomb.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "vdpau_internal.h"
  39
  40 #include "cabac.h"
  41 #if ARCH_X86
  42 #include "x86/h264_i386.h"
  43 #endif
  44
  45 //#undef NDEBUG
  46 #include <assert.h>
  47
  48 /**
  49  * Value of Picture.reference when Picture is not a reference picture, but
  50  * is held for delayed output.
  51  */
  52 #define DELAYED_PIC_REF 4
  53
   54 static VLC coeff_token_vlc[4]; /* four coeff_token VLC descriptors; NOTE(review): set up outside the part of the file visible here */
   55 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2]; /* shared backing storage: row count is the sum of coeff_token_vlc_tables_size[] */
   56 static const int coeff_token_vlc_tables_size[4]={520,332,280,256}; /* rows reserved for each of the four coeff_token tables */
   57
   58 static VLC chroma_dc_coeff_token_vlc;
   59 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2]; /* backing storage for chroma_dc_coeff_token_vlc (presumably -- confirm at the init site) */
   60 static const int chroma_dc_coeff_token_vlc_table_size = 256; /* equals the row count of chroma_dc_coeff_token_vlc_table */
   61
   62 static VLC total_zeros_vlc[15];
   63 static VLC_TYPE total_zeros_vlc_tables[15][512][2]; /* one 512-row table per total_zeros_vlc entry */
   64 static const int total_zeros_vlc_tables_size = 512; /* equals the per-table row count above */
   65
   66 static VLC chroma_dc_total_zeros_vlc[3];
   67 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2]; /* one 8-row table per chroma_dc_total_zeros_vlc entry */
   68 static const int chroma_dc_total_zeros_vlc_tables_size = 8; /* equals the per-table row count above */
   69
   70 static VLC run_vlc[6];
   71 static VLC_TYPE run_vlc_tables[6][8][2]; /* one 8-row table per run_vlc entry */
   72 static const int run_vlc_tables_size = 8; /* equals the per-table row count above */
   73
   74 static VLC run7_vlc;
   75 static VLC_TYPE run7_vlc_table[96][2]; /* backing storage for run7_vlc (presumably -- confirm at the init site) */
   76 static const int run7_vlc_table_size = 96; /* equals the row count of run7_vlc_table */
  77
   78 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); /* forward declarations of file-local functions; their definitions are not in the part of the file visible here */
   79 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
   80 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   81 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   82 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  83
  84 static av_always_inline uint32_t pack16to32(int a, int b){
  85 #ifdef WORDS_BIGENDIAN
  86    return (b&0xFFFF) + (a<<16);
  87 #else
  88    return (a&0xFFFF) + (b<<16);
  89 #endif
  90 }
  91
   92 static const uint8_t rem6[52]={ /* lookup table: rem6[i] == i % 6 for i = 0..51 */
   93 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
   94 };
   95
   96 static const uint8_t div6[52]={ /* lookup table: div6[i] == i / 6 for i = 0..51 */
   97 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
   98 };
  99
  100 static const uint8_t left_block_options[4][8]={ /* Per-row index sets used by fill_caches() to read data of the left neighbour(s).
                                                       * Row 0 is the default; when the left pair and the current MB differ in
                                                       * field/frame coding, row 3 is used for a field MB and row 2 - bottom otherwise.
                                                       * Columns 0..3 index intra4x4_pred_mode / non_zero_count entries and motion rows,
                                                       * columns 4..7 are used only as non_zero_count indices. */
  101     {0,1,2,3,7,10,8,11},
  102     {2,2,3,3,8,11,8,11},
  103     {0,0,1,1,7,10,7,10},
  104     {0,2,0,2,7,10,7,10}
  105 };
 106
  107 #define LEVEL_TAB_BITS 8 /* log2 of the middle dimension of cavlc_level_tab */
  108 static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2]; /* NOTE(review): filled and read outside the part of the file visible here; presumably a CAVLC level lookup -- confirm */
 109
  110 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
          /*
           * Locates the top, top-left, top-right and left neighbours of the
           * macroblock at h->mb_xy, records them in h->top_mb_xy / h->left_mb_xy[]
           * and copies the neighbour data into the per-macroblock caches:
           * intra sample-availability masks, intra4x4_pred_mode_cache,
           * non_zero_count_cache, top_cbp / left_cbp (CABAC only),
           * mv_cache / ref_cache / mvd_cache / direct_cache and
           * neighbor_transform_size.
           *
           * mb_type:     type flags of the current macroblock.
           * for_deblock: non-zero selects the variant used for deblocking:
           *              neighbours count as present when their slice_table entry
           *              is < 0xFFFF (instead of "same slice as the current MB"),
           *              the intra prediction setup is skipped and no top-left /
           *              top-right motion data is loaded.
           */
  111     MpegEncContext * const s = &h->s;
  112     const int mb_xy= h->mb_xy;
  113     int topleft_xy, top_xy, topright_xy, left_xy[2];
  114     int topleft_type, top_type, topright_type, left_type[2];
  115     const uint8_t * left_block;
  116     int topleft_partition= -1;
  117     int i;
  118
  119     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  120
          // Deblocking shortcut: return without touching any cache when this is
          // slice 1 or the top MB is in the same slice, and MBAFF is off.
          // Note that h->top_mb_xy / h->left_mb_xy are not updated on this path.
  121     //FIXME deblocking could skip the intra and nnz parts.
  122     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  123         return;
  124
  125     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  126      * stuff, I can't imagine that these complex rules are worth it. */
  127
  128     topleft_xy = top_xy - 1;
  129     topright_xy= top_xy + 1;
  130     left_xy[1] = left_xy[0] = mb_xy-1;
  131     left_block = left_block_options[0];
          // MBAFF: neighbours are found through macroblock pairs; the indices set
          // above are corrected according to the field/frame coding of the current
          // MB and of the neighbouring pairs.
  132     if(FRAME_MBAFF){
  133         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
  134         const int top_pair_xy      = pair_xy     - s->mb_stride;
  135         const int topleft_pair_xy  = top_pair_xy - 1;
  136         const int topright_pair_xy = top_pair_xy + 1;
  137         const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
  138         const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
  139         const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
  140         const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
  141         const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
  142         const int bottom = (s->mb_y & 1);
  143         tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);
  144
  145         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
  146             top_xy -= s->mb_stride;
  147         }
  148         if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
  149             topleft_xy -= s->mb_stride;
  150         } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
  151             topleft_xy += s->mb_stride;
  152             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
  153             topleft_partition = 0;
  154         }
  155         if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
  156             topright_xy -= s->mb_stride;
  157         }
              // Left pair coded differently from the current MB: start both left
              // entries at the top MB of the left pair and pick the matching
              // left_block_options row.
  158         if (left_mb_field_flag != curr_mb_field_flag) {
  159             left_xy[1] = left_xy[0] = pair_xy - 1;
  160             if (curr_mb_field_flag) {
  161                 left_xy[1] += s->mb_stride;
  162                 left_block = left_block_options[3];
  163             } else {
  164                 left_block= left_block_options[2 - bottom];
  165             }
  166         }
  167     }
  168
  169     h->top_mb_xy = top_xy;
  170     h->left_mb_xy[0] = left_xy[0];
  171     h->left_mb_xy[1] = left_xy[1];
  172     if(for_deblock){
              // Deblocking: top-left / top-right are not used; top and left count as
              // present whenever their slice_table entry is below 0xFFFF.
  173         topleft_type = 0;
  174         topright_type = 0;
  175         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
  176         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
  177         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
  178
  179         if(MB_MBAFF && !IS_INTRA(mb_type)){
  180             int list;
  181             for(list=0; list<h->list_count; list++){
  182                 //These values were changed for ease of performing MC, we need to change them back
  183                 //FIXME maybe we can make MC and loop filter use the same values or prevent
  184                 //the MC code from changing ref_cache and rather use a temporary array.
  185                 if(USES_LIST(mb_type,list)){
  186                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
  187                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
  188                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
  189                     ref += h->b8_stride;
  190                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
  191                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
  192                 }
  193             }
  194         }
  195     }else{
              // Decoding: a neighbour is only usable when it belongs to the current
              // slice; otherwise its type is treated as 0 (unavailable).
  196         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
  197         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
  198         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
  199         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
  200         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
  201
          // Intra setup (still inside the non-deblock branch): compute the
          // *_samples_available bit masks and, for intra 4x4, the neighbouring
          // prediction modes.
  202     if(IS_INTRA(mb_type)){
              // With constrained_intra_pred only intra neighbours pass the mask;
              // otherwise any non-zero neighbour type does.
  203         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
  204         h->topleft_samples_available=
  205         h->top_samples_available=
  206         h->left_samples_available= 0xFFFF;
  207         h->topright_samples_available= 0xEEEA;
  208
  209         if(!(top_type & type_mask)){
  210             h->topleft_samples_available= 0xB3FF;
  211             h->top_samples_available= 0x33FF;
  212             h->topright_samples_available= 0x26EA;
  213         }
  214         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
  215             if(IS_INTERLACED(mb_type)){
  216                 if(!(left_type[0] & type_mask)){
  217                     h->topleft_samples_available&= 0xDFFF;
  218                     h->left_samples_available&= 0x5FFF;
  219                 }
  220                 if(!(left_type[1] & type_mask)){
  221                     h->topleft_samples_available&= 0xFF5F;
  222                     h->left_samples_available&= 0xFF5F;
  223                 }
  224             }else{
                      // Frame MB next to a field pair: both MBs of the left pair must
                      // pass the mask for the left samples to be usable.
  225                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
  226                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
  227                 assert(left_xy[0] == left_xy[1]);
  228                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
  229                     h->topleft_samples_available&= 0xDF5F;
  230                     h->left_samples_available&= 0x5F5F;
  231                 }
  232             }
  233         }else{
  234             if(!(left_type[0] & type_mask)){
  235                 h->topleft_samples_available&= 0xDF5F;
  236                 h->left_samples_available&= 0x5F5F;
  237             }
  238         }
  239
  240         if(!(topleft_type & type_mask))
  241             h->topleft_samples_available&= 0x7FFF;
  242
  243         if(!(topright_type & type_mask))
  244             h->topright_samples_available&= 0xFBFF;
  245
  246         if(IS_INTRA4x4(mb_type)){
                  // Neighbour prediction modes: copied from intra 4x4 neighbours;
                  // otherwise -1 when the neighbour fails the mask, else 2.
  247             if(IS_INTRA4x4(top_type)){
  248                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
  249                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
  250                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
  251                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
  252             }else{
  253                 int pred;
  254                 if(!(top_type & type_mask))
  255                     pred= -1;
  256                 else{
  257                     pred= 2;
  258                 }
  259                 h->intra4x4_pred_mode_cache[4+8*0]=
  260                 h->intra4x4_pred_mode_cache[5+8*0]=
  261                 h->intra4x4_pred_mode_cache[6+8*0]=
  262                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
  263             }
  264             for(i=0; i<2; i++){
  265                 if(IS_INTRA4x4(left_type[i])){
  266                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
  267                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
  268                 }else{
  269                     int pred;
  270                     if(!(left_type[i] & type_mask))
  271                         pred= -1;
  272                     else{
  273                         pred= 2;
  274                     }
  275                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
  276                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
  277                 }
  278             }
  279         }
  280     }
  281     }
  282
  283
  284 /*
  285 0 . T T. T T T T
  286 1 L . .L . . . .
  287 2 L . .L . . . .
  288 3 . T TL . . . .
  289 4 L . .L . . . .
  290 5 L . .. . . . .
  291 */
          // Non-zero coefficient counts of the top and left neighbours. A missing
          // neighbour yields 0 for a non-intra MB under CABAC and 64 otherwise.
  292 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
  293     if(top_type){
  294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
  295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
  296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
  297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
  298
  299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
  300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
  301
  302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
  303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
  304
  305     }else{
  306         h->non_zero_count_cache[4+8*0]=
  307         h->non_zero_count_cache[5+8*0]=
  308         h->non_zero_count_cache[6+8*0]=
  309         h->non_zero_count_cache[7+8*0]=
  310
  311         h->non_zero_count_cache[1+8*0]=
  312         h->non_zero_count_cache[2+8*0]=
  313
  314         h->non_zero_count_cache[1+8*3]=
  315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  316
  317     }
  318
  319     for (i=0; i<2; i++) {
  320         if(left_type[i]){
  321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
  322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
  323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
  324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
  325         }else{
  326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
  327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
  328             h->non_zero_count_cache[0+8*1 +   8*i]=
  329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  330         }
  331     }
  332
          // CABAC only: coded block patterns of the top and left neighbours, with
          // defaults of 0x1C0 (intra MB) or 0 (otherwise) when a neighbour is missing.
  333     if( h->pps.cabac ) {
  334         // top_cbp
  335         if(top_type) {
  336             h->top_cbp = h->cbp_table[top_xy];
  337         } else if(IS_INTRA(mb_type)) {
  338             h->top_cbp = 0x1C0;
  339         } else {
  340             h->top_cbp = 0;
  341         }
  342         // left_cbp
  343         if (left_type[0]) {
  344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
  345         } else if(IS_INTRA(mb_type)) {
  346             h->left_cbp = 0x1C0;
  347         } else {
  348             h->left_cbp = 0;
  349         }
  350         if (left_type[0]) {
  351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
  352         }
  353         if (left_type[1]) {
  354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
  355         }
  356     }
  357
          // Motion data: for every reference list, load the neighbours' motion
          // vectors and reference indices around the current MB in the caches.
  358 #if 1
  359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
  360         int list;
  361         for(list=0; list<h->list_count; list++){
  362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
  363                 /*if(!h->mv_cache_clean[list]){
  364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
  365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
  366                     h->mv_cache_clean[list]= 1;
  367                 }*/
  368                 continue;
  369             }
  370             h->mv_cache_clean[list]= 0;
  371
                  // Top neighbour: its bottom row of motion vectors / ref indices, or
                  // zero vectors with LIST_NOT_USED / PART_NOT_AVAILABLE markers.
  372             if(USES_LIST(top_type, list)){
  373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
  375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
  376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
  377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
  378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
  379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
  380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
  381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
  382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
  383             }else{
  384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
  385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
  386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
  387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
  388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
  389             }
  390
                  // Left neighbours: two cache rows per left entry, taken from the
                  // rows selected by left_block.
  391             for(i=0; i<2; i++){
  392                 int cache_idx = scan8[0] - 1 + i*2*8;
  393                 if(USES_LIST(left_type[i], list)){
  394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
  395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
  396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
  397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
  398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
  399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
  400                 }else{
  401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
  402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
  403                     h->ref_cache[list][cache_idx  ]=
  404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  405                 }
  406             }
  407
                  // Nothing below is loaded for deblocking, nor for direct MBs without
                  // spatial prediction outside MBAFF.
  408             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
  409                 continue;
  410
  411             if(USES_LIST(topleft_type, list)){
                      // topleft_partition is -1 (all bits set) or 0, so the '&' either
                      // keeps or drops the extra row offset.
  412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
  413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
  414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  416             }else{
  417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
  418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  419             }
  420
  421             if(USES_LIST(topright_type, list)){
  422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
  423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
  424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  426             }else{
  427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
  428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  429             }
  430
  431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
  432                 continue;
  433
                  // Mark the cache cells to the right of the MB rows (and two cells
                  // inside) as not available, with zero motion vectors.
  434             h->ref_cache[list][scan8[5 ]+1] =
  435             h->ref_cache[list][scan8[7 ]+1] =
  436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
  437             h->ref_cache[list][scan8[4 ]] =
  438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
  439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
  440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
  441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
  443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
  444
  445             if( h->pps.cabac ) {
  446                 /* XXX beurk, Load mvd */
  447                 if(USES_LIST(top_type, list)){
  448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
  450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
  451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
  452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
  453                 }else{
  454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
  455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
  456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
  457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
  458                 }
  459                 if(USES_LIST(left_type[0], list)){
  460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
  461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
  462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
  463                 }else{
  464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
  465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
  466                 }
  467                 if(USES_LIST(left_type[1], list)){
  468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
  469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
  470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
  471                 }else{
  472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
  473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
  474                 }
  475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
  476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
  477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
  479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
  480
                      // B slices: clear the direct flags of the current MB, then load
                      // the flags of the top and left neighbours.
  481                 if(h->slice_type_nos == FF_B_TYPE){
  482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
  483
  484                     if(IS_DIRECT(top_type)){
  485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
  486                     }else if(IS_8X8(top_type)){
  487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
  488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
  489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
  490                     }else{
  491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
  492                     }
  493
  494                     if(IS_DIRECT(left_type[0]))
  495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
  496                     else if(IS_8X8(left_type[0]))
  497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
  498                     else
  499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
  500
  501                     if(IS_DIRECT(left_type[1]))
  502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
  503                     else if(IS_8X8(left_type[1]))
  504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
  505                     else
  506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
  507                 }
  508             }
  509
                  // MBAFF: rescale the cached neighbour entries whose field/frame coding
                  // differs from the current MB. MAP_MVS lists every neighbour cell and
                  // MAP_F2F is defined per direction: a field MB doubles the ref index
                  // and halves the vertical mv/mvd of frame neighbours; a frame MB does
                  // the opposite for field neighbours.
  510             if(FRAME_MBAFF){
  511 #define MAP_MVS\
  512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
  513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
  514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
  515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
  516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
  517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
  518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
  519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
  520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
  521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
  522                 if(MB_FIELD){
  523 #define MAP_F2F(idx, mb_type)\
  524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  525                         h->ref_cache[list][idx] <<= 1;\
  526                         h->mv_cache[list][idx][1] /= 2;\
  527                         h->mvd_cache[list][idx][1] /= 2;\
  528                     }
  529                     MAP_MVS
  530 #undef MAP_F2F
  531                 }else{
  532 #define MAP_F2F(idx, mb_type)\
  533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  534                         h->ref_cache[list][idx] >>= 1;\
  535                         h->mv_cache[list][idx][1] <<= 1;\
  536                         h->mvd_cache[list][idx][1] <<= 1;\
  537                     }
  538                     MAP_MVS
  539 #undef MAP_F2F
  540                 }
  541             }
  542         }
  543     }
  544 #endif
  545
          // Number (0..2) of top / first-left neighbours that use the 8x8 transform.
  546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
  547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     const int mb_xy= h->mb_xy;
 551
 552     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 553     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 554     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 555     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 556     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 557     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 558     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 559 }
 560
 561 /**
 562  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 563  */
 564 static inline int check_intra4x4_pred_mode(H264Context *h){
 565     MpegEncContext * const s = &h->s;
 566     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 567     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 568     int i;
 569
 570     if(!(h->top_samples_available&0x8000)){
 571         for(i=0; i<4; i++){
 572             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 573             if(status<0){
 574                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 575                 return -1;
 576             } else if(status){
 577                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 578             }
 579         }
 580     }
 581
 582     if((h->left_samples_available&0x8888)!=0x8888){
 583         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 584         for(i=0; i<4; i++){
 585             if(!(h->left_samples_available&mask[i])){
 586                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 587                 if(status<0){
 588                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 589                     return -1;
 590                 } else if(status){
 591                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 592                 }
 593             }
 594         }
 595     }
 596
 597     return 0;
 598 } //FIXME cleanup like next
 599
 600 /**
 601  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 602  */
 603 static inline int check_intra_pred_mode(H264Context *h, int mode){
 604     MpegEncContext * const s = &h->s;
 605     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 606     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 607
 608     if(mode > 6U) {
 609         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 610         return -1;
 611     }
 612
 613     if(!(h->top_samples_available&0x8000)){
 614         mode= top[ mode ];
 615         if(mode<0){
 616             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 617             return -1;
 618         }
 619     }
 620
 621     if((h->left_samples_available&0x8080) != 0x8080){
 622         mode= left[ mode ];
 623         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 624             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 625         }
 626         if(mode<0){
 627             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 628             return -1;
 629         }
 630     }
 631
 632     return mode;
 633 }
 634
 635 /**
 636  * gets the predicted intra4x4 prediction mode.
 637  */
 638 static inline int pred_intra_mode(H264Context *h, int n){
 639     const int index8= scan8[n];
 640     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 641     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 642     const int min= FFMIN(left, top);
 643
 644     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 645
 646     if(min<0) return DC_PRED;
 647     else      return min;
 648 }
 649
 650 static inline void write_back_non_zero_count(H264Context *h){
 651     const int mb_xy= h->mb_xy;
 652
 653     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 654     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 655     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 656     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 657     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 658     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 659     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 660
 661     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 662     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 663     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 664
 665     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 666     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 667     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 668 }
 669
 670 /**
 671  * gets the predicted number of non-zero coefficients.
 672  * @param n block index
 673  */
 674 static inline int pred_non_zero_count(H264Context *h, int n){
 675     const int index8= scan8[n];
 676     const int left= h->non_zero_count_cache[index8 - 1];
 677     const int top = h->non_zero_count_cache[index8 - 8];
 678     int i= left + top;
 679
 680     if(i<64) i= (i+1)>>1;
 681
 682     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 683
 684     return i&31;
 685 }
 686
 687 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 688     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 689     MpegEncContext *s = &h->s;
 690
 691     /* there is no consistent mapping of mvs to neighboring locations that will
 692      * make mbaff happy, so we can't move all this logic to fill_caches */
 693     if(FRAME_MBAFF){
 694         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 695         const int16_t *mv;
 696         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 697         *C = h->mv_cache[list][scan8[0]-2];
 698
 699         if(!MB_FIELD
 700            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 701             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 702             if(IS_INTERLACED(mb_types[topright_xy])){
 703 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 704                 const int x4 = X4, y4 = Y4;\
 705                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 706                 if(!USES_LIST(mb_type,list))\
 707                     return LIST_NOT_USED;\
 708                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 709                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 710                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 711                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 712
 713                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 714             }
 715         }
 716         if(topright_ref == PART_NOT_AVAILABLE
 717            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 718            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 719             if(!MB_FIELD
 720                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 721                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 722             }
 723             if(MB_FIELD
 724                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 725                && i >= scan8[0]+8){
 726                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 727                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 728             }
 729         }
 730 #undef SET_DIAG_MV
 731     }
 732
 733     if(topright_ref != PART_NOT_AVAILABLE){
 734         *C= h->mv_cache[list][ i - 8 + part_width ];
 735         return topright_ref;
 736     }else{
 737         tprintf(s->avctx, "topright MV not available\n");
 738
 739         *C= h->mv_cache[list][ i - 8 - 1 ];
 740         return h->ref_cache[list][ i - 8 - 1 ];
 741     }
 742 }
 743
 744 /**
 745  * gets the predicted MV.
 746  * @param n the block index
 747  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 748  * @param mx the x component of the predicted motion vector
 749  * @param my the y component of the predicted motion vector
 750  */
 751 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 752     const int index8= scan8[n];
 753     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 754     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 755     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 756     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 757     const int16_t * C;
 758     int diagonal_ref, match_count;
 759
 760     assert(part_width==1 || part_width==2 || part_width==4);
 761
 762 /* mv_cache
 763   B . . A T T T T
 764   U . . L . . , .
 765   U . . L . . . .
 766   U . . L . . , .
 767   . . . L . . . .
 768 */
 769
 770     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 771     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 772     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 773     if(match_count > 1){ //most common
 774         *mx= mid_pred(A[0], B[0], C[0]);
 775         *my= mid_pred(A[1], B[1], C[1]);
 776     }else if(match_count==1){
 777         if(left_ref==ref){
 778             *mx= A[0];
 779             *my= A[1];
 780         }else if(top_ref==ref){
 781             *mx= B[0];
 782             *my= B[1];
 783         }else{
 784             *mx= C[0];
 785             *my= C[1];
 786         }
 787     }else{
 788         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 789             *mx= A[0];
 790             *my= A[1];
 791         }else{
 792             *mx= mid_pred(A[0], B[0], C[0]);
 793             *my= mid_pred(A[1], B[1], C[1]);
 794         }
 795     }
 796
 797     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 798 }
 799
 800 /**
 801  * gets the directionally predicted 16x8 MV.
 802  * @param n the block index
 803  * @param mx the x component of the predicted motion vector
 804  * @param my the y component of the predicted motion vector
 805  */
 806 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 807     if(n==0){
 808         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 809         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 810
 811         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 812
 813         if(top_ref == ref){
 814             *mx= B[0];
 815             *my= B[1];
 816             return;
 817         }
 818     }else{
 819         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 820         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 821
 822         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 823
 824         if(left_ref == ref){
 825             *mx= A[0];
 826             *my= A[1];
 827             return;
 828         }
 829     }
 830
 831     //RARE
 832     pred_motion(h, n, 4, list, ref, mx, my);
 833 }
 834
 835 /**
 836  * gets the directionally predicted 8x16 MV.
 837  * @param n the block index
 838  * @param mx the x component of the predicted motion vector
 839  * @param my the y component of the predicted motion vector
 840  */
 841 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 842     if(n==0){
 843         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 844         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 845
 846         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 847
 848         if(left_ref == ref){
 849             *mx= A[0];
 850             *my= A[1];
 851             return;
 852         }
 853     }else{
 854         const int16_t * C;
 855         int diagonal_ref;
 856
 857         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 858
 859         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 860
 861         if(diagonal_ref == ref){
 862             *mx= C[0];
 863             *my= C[1];
 864             return;
 865         }
 866     }
 867
 868     //RARE
 869     pred_motion(h, n, 2, list, ref, mx, my);
 870 }
 871
 872 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 873     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 874     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 875
 876     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 877
 878     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 879        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 880        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 881
 882         *mx = *my = 0;
 883         return;
 884     }
 885
 886     pred_motion(h, 0, 4, 0, 0, mx, my);
 887
 888     return;
 889 }
 890
 891 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 892     int poc0 = h->ref_list[0][i].poc;
 893     int td = av_clip(poc1 - poc0, -128, 127);
 894     if(td == 0 || h->ref_list[0][i].long_ref){
 895         return 256;
 896     }else{
 897         int tb = av_clip(poc - poc0, -128, 127);
 898         int tx = (16384 + (FFABS(td) >> 1)) / td;
 899         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 900     }
 901 }
 902
 903 static inline void direct_dist_scale_factor(H264Context * const h){
 904     MpegEncContext * const s = &h->s;
 905     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 906     const int poc1 = h->ref_list[1][0].poc;
 907     int i, field;
 908     for(field=0; field<2; field++){
 909         const int poc  = h->s.current_picture_ptr->field_poc[field];
 910         const int poc1 = h->ref_list[1][0].field_poc[field];
 911         for(i=0; i < 2*h->ref_count[0]; i++)
 912             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 913     }
 914
 915     for(i=0; i<h->ref_count[0]; i++){
 916         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 917     }
 918 }
 919
 920 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 921     MpegEncContext * const s = &h->s;
 922     Picture * const ref1 = &h->ref_list[1][0];
 923     int j, old_ref, rfield;
 924     int start= mbafi ? 16                      : 0;
 925     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 926     int interl= mbafi || s->picture_structure != PICT_FRAME;
 927
 928     /* bogus; fills in for missing frames */
 929     memset(map[list], 0, sizeof(map[list]));
 930
 931     for(rfield=0; rfield<2; rfield++){
 932         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 933             int poc = ref1->ref_poc[colfield][list][old_ref];
 934
 935             if     (!interl)
 936                 poc |= 3;
 937             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 938                 poc= (poc&~3) + rfield + 1;
 939
 940             for(j=start; j<end; j++){
 941                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 942                     int cur_ref= mbafi ? (j-16)^field : j;
 943                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 944                     if(rfield == field)
 945                         map[list][old_ref] = cur_ref;
 946                     break;
 947                 }
 948             }
 949         }
 950     }
 951 }
 952
 953 static inline void direct_ref_list_init(H264Context * const h){
 954     MpegEncContext * const s = &h->s;
 955     Picture * const ref1 = &h->ref_list[1][0];
 956     Picture * const cur = s->current_picture_ptr;
 957     int list, j, field;
 958     int sidx= (s->picture_structure&1)^1;
 959     int ref1sidx= (ref1->reference&1)^1;
 960
 961     for(list=0; list<2; list++){
 962         cur->ref_count[sidx][list] = h->ref_count[list];
 963         for(j=0; j<h->ref_count[list]; j++)
 964             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 965     }
 966
 967     if(s->picture_structure == PICT_FRAME){
 968         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 969         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 970     }
 971
 972     cur->mbaff= FRAME_MBAFF;
 973
 974     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 975         return;
 976
 977     for(list=0; list<2; list++){
 978         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 979         for(field=0; field<2; field++)
 980             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 981     }
 982 }
 983
 984 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 985     MpegEncContext * const s = &h->s;
 986     int b8_stride = h->b8_stride;
 987     int b4_stride = h->b_stride;
 988     int mb_xy = h->mb_xy;
 989     int mb_type_col[2];
 990     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 991     const int8_t *l1ref0, *l1ref1;
 992     const int is_b8x8 = IS_8X8(*mb_type);
 993     unsigned int sub_mb_type;
 994     int i8, i4;
 995
 996     assert(h->ref_list[1][0].reference&3);
 997
 998 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 999
1000     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1001         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1002             int cur_poc = s->current_picture_ptr->poc;
1003             int *col_poc = h->ref_list[1]->field_poc;
1004             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1005             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1006             b8_stride = 0;
1007         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1008             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1009             mb_xy += s->mb_stride*fieldoff;
1010         }
1011         goto single_col;
1012     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1013         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1014             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1015             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1016             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1017             b8_stride *= 3;
1018             b4_stride *= 6;
1019             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1020             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1021                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1022                 && !is_b8x8){
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1025             }else{
1026                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1027                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1028             }
1029         }else{                                           //     AFR/FR    -> AFR/FR
1030 single_col:
1031             mb_type_col[0] =
1032             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1033             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1034                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1035                 * so we know exactly what block size to use */
1036                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1037                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1038             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1041             }else{
1042                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1043                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1044             }
1045         }
1046     }
1047
1048     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1049     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1050     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1051     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1052     if(!b8_stride){
1053         if(s->mb_y&1){
1054             l1ref0 += h->b8_stride;
1055             l1ref1 += h->b8_stride;
1056             l1mv0  +=  2*b4_stride;
1057             l1mv1  +=  2*b4_stride;
1058         }
1059     }
1060
1061     if(h->direct_spatial_mv_pred){
1062         int ref[2];
1063         int mv[2][2];
1064         int list;
1065
1066         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1067
1068         /* ref = min(neighbors) */
1069         for(list=0; list<2; list++){
1070             int refa = h->ref_cache[list][scan8[0] - 1];
1071             int refb = h->ref_cache[list][scan8[0] - 8];
1072             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1073             if(refc == PART_NOT_AVAILABLE)
1074                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1075             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1076             if(ref[list] < 0)
1077                 ref[list] = -1;
1078         }
1079
1080         if(ref[0] < 0 && ref[1] < 0){
1081             ref[0] = ref[1] = 0;
1082             mv[0][0] = mv[0][1] =
1083             mv[1][0] = mv[1][1] = 0;
1084         }else{
1085             for(list=0; list<2; list++){
1086                 if(ref[list] >= 0)
1087                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1088                 else
1089                     mv[list][0] = mv[list][1] = 0;
1090             }
1091         }
1092
1093         if(ref[1] < 0){
1094             if(!is_b8x8)
1095                 *mb_type &= ~MB_TYPE_L1;
1096             sub_mb_type &= ~MB_TYPE_L1;
1097         }else if(ref[0] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L0;
1100             sub_mb_type &= ~MB_TYPE_L0;
1101         }
1102
1103         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1104             for(i8=0; i8<4; i8++){
1105                 int x8 = i8&1;
1106                 int y8 = i8>>1;
1107                 int xy8 = x8+y8*b8_stride;
1108                 int xy4 = 3*x8+y8*b4_stride;
1109                 int a=0, b=0;
1110
1111                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1112                     continue;
1113                 h->sub_mb_type[i8] = sub_mb_type;
1114
1115                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1116                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1117                 if(!IS_INTRA(mb_type_col[y8])
1118                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1119                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1120                     if(ref[0] > 0)
1121                         a= pack16to32(mv[0][0],mv[0][1]);
1122                     if(ref[1] > 0)
1123                         b= pack16to32(mv[1][0],mv[1][1]);
1124                 }else{
1125                     a= pack16to32(mv[0][0],mv[0][1]);
1126                     b= pack16to32(mv[1][0],mv[1][1]);
1127                 }
1128                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1129                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1130             }
1131         }else if(IS_16X16(*mb_type)){
1132             int a=0, b=0;
1133
1134             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1135             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1136             if(!IS_INTRA(mb_type_col[0])
1137                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1138                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1139                        && (h->x264_build>33 || !h->x264_build)))){
1140                 if(ref[0] > 0)
1141                     a= pack16to32(mv[0][0],mv[0][1]);
1142                 if(ref[1] > 0)
1143                     b= pack16to32(mv[1][0],mv[1][1]);
1144             }else{
1145                 a= pack16to32(mv[0][0],mv[0][1]);
1146                 b= pack16to32(mv[1][0],mv[1][1]);
1147             }
1148             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1149             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1150         }else{
1151             for(i8=0; i8<4; i8++){
1152                 const int x8 = i8&1;
1153                 const int y8 = i8>>1;
1154
1155                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1156                     continue;
1157                 h->sub_mb_type[i8] = sub_mb_type;
1158
1159                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1160                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1161                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1162                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1163
1164                 /* col_zero_flag */
1165                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1166                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1167                                                   && (h->x264_build>33 || !h->x264_build)))){
1168                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1169                     if(IS_SUB_8X8(sub_mb_type)){
1170                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1171                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1172                             if(ref[0] == 0)
1173                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1174                             if(ref[1] == 0)
1175                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1176                         }
1177                     }else
1178                     for(i4=0; i4<4; i4++){
1179                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1180                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1181                             if(ref[0] == 0)
1182                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1183                             if(ref[1] == 0)
1184                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1185                         }
1186                     }
1187                 }
1188             }
1189         }
1190     }else{ /* direct temporal mv pred */
1191         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1192         const int *dist_scale_factor = h->dist_scale_factor;
1193         int ref_offset= 0;
1194
1195         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1196             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1197             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1198             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1199         }
1200         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1201             ref_offset += 16;
1202
1203         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1204             /* FIXME assumes direct_8x8_inference == 1 */
1205             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1206
1207             for(i8=0; i8<4; i8++){
1208                 const int x8 = i8&1;
1209                 const int y8 = i8>>1;
1210                 int ref0, scale;
1211                 const int16_t (*l1mv)[2]= l1mv0;
1212
1213                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1214                     continue;
1215                 h->sub_mb_type[i8] = sub_mb_type;
1216
1217                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1218                 if(IS_INTRA(mb_type_col[y8])){
1219                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1220                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1221                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1222                     continue;
1223                 }
1224
1225                 ref0 = l1ref0[x8 + y8*b8_stride];
1226                 if(ref0 >= 0)
1227                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1228                 else{
1229                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1230                     l1mv= l1mv1;
1231                 }
1232                 scale = dist_scale_factor[ref0];
1233                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1234
1235                 {
1236                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1237                     int my_col = (mv_col[1]<<y_shift)/2;
1238                     int mx = (scale * mv_col[0] + 128) >> 8;
1239                     int my = (scale * my_col + 128) >> 8;
1240                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1241                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1242                 }
1243             }
1244             return;
1245         }
1246
1247         /* one-to-one mv scaling */
1248
1249         if(IS_16X16(*mb_type)){
1250             int ref, mv0, mv1;
1251
1252             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1253             if(IS_INTRA(mb_type_col[0])){
1254                 ref=mv0=mv1=0;
1255             }else{
1256                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1257                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1258                 const int scale = dist_scale_factor[ref0];
1259                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1260                 int mv_l0[2];
1261                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1262                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1263                 ref= ref0;
1264                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1265                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1266             }
1267             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1268             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1269             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1270         }else{
1271             for(i8=0; i8<4; i8++){
1272                 const int x8 = i8&1;
1273                 const int y8 = i8>>1;
1274                 int ref0, scale;
1275                 const int16_t (*l1mv)[2]= l1mv0;
1276
1277                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1278                     continue;
1279                 h->sub_mb_type[i8] = sub_mb_type;
1280                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1281                 if(IS_INTRA(mb_type_col[0])){
1282                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1283                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1284                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1285                     continue;
1286                 }
1287
1288                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1289                 if(ref0 >= 0)
1290                     ref0 = map_col_to_list0[0][ref0];
1291                 else{
1292                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1293                     l1mv= l1mv1;
1294                 }
1295                 scale = dist_scale_factor[ref0];
1296
1297                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1298                 if(IS_SUB_8X8(sub_mb_type)){
1299                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1300                     int mx = (scale * mv_col[0] + 128) >> 8;
1301                     int my = (scale * mv_col[1] + 128) >> 8;
1302                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1303                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1304                 }else
1305                 for(i4=0; i4<4; i4++){
1306                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1307                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1308                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1309                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1310                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1311                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1312                 }
1313             }
1314         }
1315     }
1316 }
1317
1318 static inline void write_back_motion(H264Context *h, int mb_type){
1319     MpegEncContext * const s = &h->s;
1320     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1321     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1322     int list;
1323
1324     if(!USES_LIST(mb_type, 0))
1325         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1326
1327     for(list=0; list<h->list_count; list++){
1328         int y;
1329         if(!USES_LIST(mb_type, list))
1330             continue;
1331
1332         for(y=0; y<4; y++){
1333             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1334             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1335         }
1336         if( h->pps.cabac ) {
1337             if(IS_SKIP(mb_type))
1338                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1339             else
1340             for(y=0; y<4; y++){
1341                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1342                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1343             }
1344         }
1345
1346         {
1347             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1348             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1349             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1350             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1351             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1352         }
1353     }
1354
1355     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1356         if(IS_8X8(mb_type)){
1357             uint8_t *direct_table = &h->direct_table[b8_xy];
1358             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1359             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1360             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1361         }
1362     }
1363 }
1364
/**
 * Parses the NAL unit header and returns the payload with escape bytes removed.
 *
 * The first byte of src is read as the NAL header; nal_ref_idc and
 * nal_unit_type are stored in h. The remaining bytes are scanned for the
 * first 00 00 xx pattern with xx <= 3. Any xx other than 3 is treated as a
 * start code and the unit is truncated there. If an escape (00 00 03) is
 * found, the payload is copied into h->rbsp_buffer[] with every 00 00 03
 * replaced by 00 00, followed by FF_INPUT_BUFFER_PADDING_SIZE zero bytes.
 *
 * @param src        start of the NAL unit, beginning with its header byte
 * @param dst_length receives the length in bytes of the returned payload
 * @param consumed   receives the number of bytes of src used, header included
 * @param length     number of bytes available at src
 * @return src+1 when no escape was found before the end of the unit,
 *         otherwise a pointer into h->rbsp_buffer[], or NULL if that buffer
 *         could not be allocated
 */
const uint8_t *ff_h264_decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
    int i, si, di;
    uint8_t *dst;
    int bufidx;

//    src[0]&0x80;                //forbidden bit
    h->nal_ref_idc= src[0]>>5;
    h->nal_unit_type= src[0]&0x1F;

    // skip the header byte; from here on src/length describe the payload only
    src++; length--;
#if 0
    for(i=0; i<length; i++)
        printf("%2X ", src[i]);
#endif

    /* Search for the first 00 00 0x sequence. Three variants of the loop
     * header are selected at compile time; they share the tail below.
     * RS is subtracted from i when a candidate turns out not to be a match. */
#if HAVE_FAST_UNALIGNED
# if HAVE_FAST_64BIT
#   define RS 7
    for(i=0; i+1<length; i+=9){
        // word-at-a-time test on the masked byte lanes; skip the word when it passes
        if(!((~*(const uint64_t*)(src+i) & (*(const uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
# else
#   define RS 3
    for(i=0; i+1<length; i+=5){
        // same test as the 64-bit variant on a 32-bit word
        if(!((~*(const uint32_t*)(src+i) & (*(const uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
# endif
            continue;
        if(i>0 && !src[i]) i--;
        // NOTE(review): no bound check against length here; presumably relies on
        // zero padding after the input buffer -- confirm against callers
        while(src[i]) i++;
#else
#   define RS 0
    for(i=0; i+1<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0) i--;
#endif
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            if(src[i+2]!=3){
                /* startcode, so we must be past the end */
                length=i;
            }
            break;
        }
        i-= RS;
    }

    // fast path: nothing to unescape, hand back the input buffer itself
    if(i>=length-1){ //no escaped 0
        *dst_length= length;
        *consumed= length+1; //+1 for the header
        return src;
    }

    bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
    av_fast_malloc(&h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
    dst= h->rbsp_buffer[bufidx];

    if (dst == NULL){
        return NULL;
    }

//printf("decoding esc\n");
    // everything before the first escape is copied verbatim
    memcpy(dst, src, i);
    si=di=i;
    while(si+2<length){
        //remove escapes (very rare 1:2^22)
        if(src[si+2]>3){
            // src[si+2] cannot end a 00 00 0x pattern, so two bytes can be copied
            // here; the third is copied by the statement at the end of the loop
            dst[di++]= src[si++];
            dst[di++]= src[si++];
        }else if(src[si]==0 && src[si+1]==0){
            if(src[si+2]==3){ //escape
                dst[di++]= 0;
                dst[di++]= 0;
                si+=3;
                continue;
            }else //next start code
                goto nsc;
        }

        dst[di++]= src[si++];
    }
    // fewer than three bytes left: no complete pattern possible, copy the rest
    while(si<length)
        dst[di++]= src[si++];
nsc:

    memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);

    *dst_length= di;
    *consumed= si + 1;//+1 for the header
//FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
    return dst;
}
1454
1455 int ff_h264_decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1456     int v= *src;
1457     int r;
1458
1459     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1460
1461     for(r=1; r<9; r++){
1462         if(v&1) return r;
1463         v>>=1;
1464     }
1465     return 0;
1466 }
1467
1468 /**
1469  * IDCT transforms the 16 dc values and dequantizes them.
1470  * @param qp quantization parameter
1471  */
1472 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1473 #define stride 16
1474     int i;
1475     int temp[16]; //FIXME check if this is a good idea
1476     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1477     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1478
1479 //memset(block, 64, 2*256);
1480 //return;
1481     for(i=0; i<4; i++){
1482         const int offset= y_offset[i];
1483         const int z0= block[offset+stride*0] + block[offset+stride*4];
1484         const int z1= block[offset+stride*0] - block[offset+stride*4];
1485         const int z2= block[offset+stride*1] - block[offset+stride*5];
1486         const int z3= block[offset+stride*1] + block[offset+stride*5];
1487
1488         temp[4*i+0]= z0+z3;
1489         temp[4*i+1]= z1+z2;
1490         temp[4*i+2]= z1-z2;
1491         temp[4*i+3]= z0-z3;
1492     }
1493
1494     for(i=0; i<4; i++){
1495         const int offset= x_offset[i];
1496         const int z0= temp[4*0+i] + temp[4*2+i];
1497         const int z1= temp[4*0+i] - temp[4*2+i];
1498         const int z2= temp[4*1+i] - temp[4*3+i];
1499         const int z3= temp[4*1+i] + temp[4*3+i];
1500
1501         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1502         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1503         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1504         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1505     }
1506 }
1507
/* Compiled out: forward counterpart of h264_luma_dc_dequant_idct_c, kept for
 * reference only. It uses the `stride` macro defined inside that function. */
#if 0
/**
 * DCT transforms the 16 dc values.
 * Same two butterfly passes as the inverse transform, but the results are
 * halved (>>1) instead of being scaled by a dequantization multiplier.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    // first pass: butterflies over the values selected by y_offset[i]
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    // second pass: butterflies across temp, results written back in place
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif

/* drop the helper macros so the names can be reused as ordinary identifiers below */
#undef xStride
#undef stride
1550
1551 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1552     const int stride= 16*2;
1553     const int xStride= 16;
1554     int a,b,c,d,e;
1555
1556     a= block[stride*0 + xStride*0];
1557     b= block[stride*0 + xStride*1];
1558     c= block[stride*1 + xStride*0];
1559     d= block[stride*1 + xStride*1];
1560
1561     e= a-b;
1562     a= a+b;
1563     b= c-d;
1564     c= c+d;
1565
1566     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1567     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1568     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1569     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1570 }
1571
/* Compiled out: 2x2 butterfly transform of the four chroma DC values without
 * any scaling, kept for reference only. */
#if 0
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    // load the four DC values
    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    // horizontal sums/differences; a, b and c are reused as temporaries
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1594
1595 /**
1596  * gets the chroma qp.
1597  */
1598 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1599     return h->pps.chroma_qp_table[t][qscale];
1600 }
1601
/**
 * Motion-compensates one partition from a single reference picture.
 *
 * The motion vector is taken from h->mv_cache[list][scan8[n]]. Luma is
 * produced by qpix_op[], indexed by the two low bits of each vector
 * component; both chroma planes are produced by chroma_op with the three low
 * bits as fractional position. If the referenced area reaches outside the
 * picture beyond the emulated-edge margin, the source is first copied into
 * s->edge_emu_buffer by ff_emulated_edge_mc().
 *
 * @param pic           reference picture to predict from
 * @param n             index into scan8[] selecting the motion vector
 * @param square        if zero, a second luma call is made at offset delta
 * @param chroma_height height passed to chroma_op
 * @param delta         offset of the second luma call (both dest and source)
 * @param list          reference list the motion vector belongs to
 * @param src_x_offset  added to the vector after multiplication by 8
 * @param src_y_offset  added to the vector after multiplication by 8
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    // index of the luma interpolation function: fractional x in bits 0-1, y in bits 2-3
    const int luma_xy= (mx&3) + ((my&3)<<2);
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    // a fractional component shrinks the usable margin by 3
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    // referenced block leaves the picture plus margin: build it in edge_emu_buffer
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    // luma only when gray decoding is compiled in and requested
    if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        // the adjusted vector may now point outside the chroma plane
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    // edge_emu_buffer is shared: cb must be fully consumed before cr is emulated
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1659
1660 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1661                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1662                            int x_offset, int y_offset,
1663                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1664                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1665                            int list0, int list1){
1666     MpegEncContext * const s = &h->s;
1667     qpel_mc_func *qpix_op=  qpix_put;
1668     h264_chroma_mc_func chroma_op= chroma_put;
1669
1670     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1671     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1672     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1673     x_offset += 8*s->mb_x;
1674     y_offset += 8*(s->mb_y >> MB_FIELD);
1675
1676     if(list0){
1677         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681
1682         qpix_op=  qpix_avg;
1683         chroma_op= chroma_avg;
1684     }
1685
1686     if(list1){
1687         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1688         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1689                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1690                            qpix_op, chroma_op);
1691     }
1692 }
1693
1694 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1695                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1696                            int x_offset, int y_offset,
1697                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1698                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1699                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1700                            int list0, int list1){
1701     MpegEncContext * const s = &h->s;
1702
1703     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1704     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1705     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1706     x_offset += 8*s->mb_x;
1707     y_offset += 8*(s->mb_y >> MB_FIELD);
1708
1709     if(list0 && list1){
1710         /* don't optimize for luma-only case, since B-frames usually
1711          * use implicit weights => chroma too. */
1712         uint8_t *tmp_cb = s->obmc_scratchpad;
1713         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1714         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1715         int refn0 = h->ref_cache[0][ scan8[n] ];
1716         int refn1 = h->ref_cache[1][ scan8[n] ];
1717
1718         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1719                     dest_y, dest_cb, dest_cr,
1720                     x_offset, y_offset, qpix_put, chroma_put);
1721         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1722                     tmp_y, tmp_cb, tmp_cr,
1723                     x_offset, y_offset, qpix_put, chroma_put);
1724
1725         if(h->use_weight == 2){
1726             int weight0 = h->implicit_weight[refn0][refn1];
1727             int weight1 = 64 - weight0;
1728             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1729             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1730             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1731         }else{
1732             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1733                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1734                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1735             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1736                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1737                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1739                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1740                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1741         }
1742     }else{
1743         int list = list1 ? 1 : 0;
1744         int refn = h->ref_cache[list][ scan8[n] ];
1745         Picture *ref= &h->ref_list[list][refn];
1746         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1747                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1748                     qpix_put, chroma_put);
1749
1750         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1751                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1752         if(h->use_weight_chroma){
1753             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1754                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1755             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1756                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1757         }
1758     }
1759 }
1760
1761 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1762                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1763                            int x_offset, int y_offset,
1764                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1765                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1766                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1767                            int list0, int list1){
1768     if((h->use_weight==2 && list0 && list1
1769         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1770        || h->use_weight==1)
1771         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1772                          x_offset, y_offset, qpix_put, chroma_put,
1773                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1774     else
1775         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1776                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1777 }
1778
1779 static inline void prefetch_motion(H264Context *h, int list){
1780     /* fetch pixels for estimated mv 4 macroblocks ahead
1781      * optimized for 64byte cache lines */
1782     MpegEncContext * const s = &h->s;
1783     const int refn = h->ref_cache[list][scan8[0]];
1784     if(refn >= 0){
1785         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1786         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1787         uint8_t **src= h->ref_list[list][refn].data;
1788         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1789         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1790         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1791         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1792     }
1793 }
1794
/**
 * Performs motion compensation for the current inter macroblock.
 *
 * Dispatches on the partitioning stored in the macroblock type (16x16, 16x8,
 * 8x16, or 8x8 with per-block sub-partitions) and calls mc_part() once per
 * partition. Each call passes, in order: the scan8 index of the partition,
 * the square flag, the chroma height, the delta for the second luma call,
 * the destination planes, the x/y offset of the partition, the put/avg
 * function tables to use and whether list 0 / list 1 are used.
 * A prefetch for list 0 is issued before and for list 1 after the work.
 *
 * @param qpix_put   luma put function tables, indexed by block size
 * @param chroma_put chroma put functions, indexed by block size
 * @param qpix_avg   luma avg function tables, indexed by block size
 * @param chroma_avg chroma avg functions, indexed by block size
 * @param weight_op  weighting functions, indexed by partition shape
 * @param weight_avg bi-weighting functions, indexed by partition shape
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        // one square partition covering the whole macroblock
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        // upper and lower half; each is done as two luma calls 8 pixels apart
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        // left and right half; the second luma call is 8 lines further down
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        // four 8x8 blocks, each with its own sub-partitioning
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                // four 4x4 blocks inside this 8x8 block
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1881
1882 static av_cold void init_cavlc_level_tab(void){
1883     int suffix_length, mask;
1884     unsigned int i;
1885
1886     for(suffix_length=0; suffix_length<7; suffix_length++){
1887         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1888             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1889             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1890
1891             mask= -(level_code&1);
1892             level_code= (((2+level_code)>>1) ^ mask) - mask;
1893             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1894                 cavlc_level_tab[suffix_length][i][0]= level_code;
1895                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1896             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1897                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1898                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1899             }else{
1900                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1901                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1902             }
1903         }
1904     }
1905 }
1906
1907 static av_cold void decode_init_vlc(void){
1908     static int done = 0;
1909
1910     if (!done) {
1911         int i;
1912         int offset;
1913         done = 1;
1914
1915         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1916         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1917         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1918                  &chroma_dc_coeff_token_len [0], 1, 1,
1919                  &chroma_dc_coeff_token_bits[0], 1, 1,
1920                  INIT_VLC_USE_NEW_STATIC);
1921
1922         offset = 0;
1923         for(i=0; i<4; i++){
1924             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1925             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1926             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1927                      &coeff_token_len [i][0], 1, 1,
1928                      &coeff_token_bits[i][0], 1, 1,
1929                      INIT_VLC_USE_NEW_STATIC);
1930             offset += coeff_token_vlc_tables_size[i];
1931         }
1932         /*
1933          * This is a one time safety check to make sure that
1934          * the packed static coeff_token_vlc table sizes
1935          * were initialized correctly.
1936          */
1937         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1938
1939         for(i=0; i<3; i++){
1940             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1941             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1942             init_vlc(&chroma_dc_total_zeros_vlc[i],
1943                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1944                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1945                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1946                      INIT_VLC_USE_NEW_STATIC);
1947         }
1948         for(i=0; i<15; i++){
1949             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1950             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1951             init_vlc(&total_zeros_vlc[i],
1952                      TOTAL_ZEROS_VLC_BITS, 16,
1953                      &total_zeros_len [i][0], 1, 1,
1954                      &total_zeros_bits[i][0], 1, 1,
1955                      INIT_VLC_USE_NEW_STATIC);
1956         }
1957
1958         for(i=0; i<6; i++){
1959             run_vlc[i].table = run_vlc_tables[i];
1960             run_vlc[i].table_allocated = run_vlc_tables_size;
1961             init_vlc(&run_vlc[i],
1962                      RUN_VLC_BITS, 7,
1963                      &run_len [i][0], 1, 1,
1964                      &run_bits[i][0], 1, 1,
1965                      INIT_VLC_USE_NEW_STATIC);
1966         }
1967         run7_vlc.table = run7_vlc_table,
1968         run7_vlc.table_allocated = run7_vlc_table_size;
1969         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1970                  &run_len [6][0], 1, 1,
1971                  &run_bits[6][0], 1, 1,
1972                  INIT_VLC_USE_NEW_STATIC);
1973
1974         init_cavlc_level_tab();
1975     }
1976 }
1977
1978 static void free_tables(H264Context *h){
1979     int i;
1980     H264Context *hx;
1981     av_freep(&h->intra4x4_pred_mode);
1982     av_freep(&h->chroma_pred_mode_table);
1983     av_freep(&h->cbp_table);
1984     av_freep(&h->mvd_table[0]);
1985     av_freep(&h->mvd_table[1]);
1986     av_freep(&h->direct_table);
1987     av_freep(&h->non_zero_count);
1988     av_freep(&h->slice_table_base);
1989     h->slice_table= NULL;
1990
1991     av_freep(&h->mb2b_xy);
1992     av_freep(&h->mb2b8_xy);
1993
1994     for(i = 0; i < MAX_THREADS; i++) {
1995         hx = h->thread_context[i];
1996         if(!hx) continue;
1997         av_freep(&hx->top_borders[1]);
1998         av_freep(&hx->top_borders[0]);
1999         av_freep(&hx->s.obmc_scratchpad);
2000     }
2001 }
2002
2003 static void init_dequant8_coeff_table(H264Context *h){
2004     int i,q,x;
2005     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2006     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2007     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2008
2009     for(i=0; i<2; i++ ){
2010         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2011             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2012             break;
2013         }
2014
2015         for(q=0; q<52; q++){
2016             int shift = div6[q];
2017             int idx = rem6[q];
2018             for(x=0; x<64; x++)
2019                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2020                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2021                     h->pps.scaling_matrix8[i][x]) << shift;
2022         }
2023     }
2024 }
2025
2026 static void init_dequant4_coeff_table(H264Context *h){
2027     int i,j,q,x;
2028     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2029     for(i=0; i<6; i++ ){
2030         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2031         for(j=0; j<i; j++){
2032             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2033                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2034                 break;
2035             }
2036         }
2037         if(j<i)
2038             continue;
2039
2040         for(q=0; q<52; q++){
2041             int shift = div6[q] + 2;
2042             int idx = rem6[q];
2043             for(x=0; x<16; x++)
2044                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2045                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2046                     h->pps.scaling_matrix4[i][x]) << shift;
2047         }
2048     }
2049 }
2050
2051 static void init_dequant_tables(H264Context *h){
2052     int i,x;
2053     init_dequant4_coeff_table(h);
2054     if(h->pps.transform_8x8_mode)
2055         init_dequant8_coeff_table(h);
2056     if(h->sps.transform_bypass){
2057         for(i=0; i<6; i++)
2058             for(x=0; x<16; x++)
2059                 h->dequant4_coeff[i][0][x] = 1<<6;
2060         if(h->pps.transform_8x8_mode)
2061             for(i=0; i<2; i++)
2062                 for(x=0; x<64; x++)
2063                     h->dequant8_coeff[i][0][x] = 1<<6;
2064     }
2065 }
2066
2067
2068 /**
2069  * allocates tables.
2070  * needs width/height
2071  */
2072 static int alloc_tables(H264Context *h){
2073     MpegEncContext * const s = &h->s;
2074     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2075     int x,y;
2076
2077     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2078
2079     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2080     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2081     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2082
2083     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2084     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2085     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2086     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2087
2088     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2089     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2090
2091     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2092     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2093     for(y=0; y<s->mb_height; y++){
2094         for(x=0; x<s->mb_width; x++){
2095             const int mb_xy= x + y*s->mb_stride;
2096             const int b_xy = 4*x + 4*y*h->b_stride;
2097             const int b8_xy= 2*x + 2*y*h->b8_stride;
2098
2099             h->mb2b_xy [mb_xy]= b_xy;
2100             h->mb2b8_xy[mb_xy]= b8_xy;
2101         }
2102     }
2103
2104     s->obmc_scratchpad = NULL;
2105
2106     if(!h->dequant4_coeff[0])
2107         init_dequant_tables(h);
2108
2109     return 0;
2110 fail:
2111     free_tables(h);
2112     return -1;
2113 }
2114
2115 /**
2116  * Mimic alloc_tables(), but for every context thread.
2117  */
2118 static void clone_tables(H264Context *dst, H264Context *src){
2119     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2120     dst->non_zero_count           = src->non_zero_count;
2121     dst->slice_table              = src->slice_table;
2122     dst->cbp_table                = src->cbp_table;
2123     dst->mb2b_xy                  = src->mb2b_xy;
2124     dst->mb2b8_xy                 = src->mb2b8_xy;
2125     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2126     dst->mvd_table[0]             = src->mvd_table[0];
2127     dst->mvd_table[1]             = src->mvd_table[1];
2128     dst->direct_table             = src->direct_table;
2129
2130     dst->s.obmc_scratchpad = NULL;
2131     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2132 }
2133
2134 /**
2135  * Init context
2136  * Allocate buffers which are not shared amongst multiple threads.
2137  */
2138 static int context_init(H264Context *h){
2139     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2140     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2141
2142     return 0;
2143 fail:
2144     return -1; // free_tables will clean up for us
2145 }
2146
2147 static av_cold void common_init(H264Context *h){
2148     MpegEncContext * const s = &h->s;
2149
2150     s->width = s->avctx->width;
2151     s->height = s->avctx->height;
2152     s->codec_id= s->avctx->codec->id;
2153
2154     ff_h264_pred_init(&h->hpc, s->codec_id);
2155
2156     h->dequant_coeff_pps= -1;
2157     s->unrestricted_mv=1;
2158     s->decode=1; //FIXME
2159
2160     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2161
2162     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2163     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2164 }
2165
2166 /**
2167  * Reset SEI values at the beginning of the frame.
2168  *
2169  * @param h H.264 context.
2170  */
2171 static void reset_sei(H264Context *h) {
2172     h->sei_recovery_frame_cnt       = -1;
2173     h->sei_dpb_output_delay         =  0;
2174     h->sei_cpb_removal_delay        = -1;
2175     h->sei_buffering_period_present =  0;
2176 }
2177
2178 static av_cold int decode_init(AVCodecContext *avctx){
2179     H264Context *h= avctx->priv_data;
2180     MpegEncContext * const s = &h->s;
2181
2182     MPV_decode_defaults(s);
2183
2184     s->avctx = avctx;
2185     common_init(h);
2186
2187     s->out_format = FMT_H264;
2188     s->workaround_bugs= avctx->workaround_bugs;
2189
2190     // set defaults
2191 //    s->decode_mb= ff_h263_decode_mb;
2192     s->quarter_sample = 1;
2193     if(!avctx->has_b_frames)
2194     s->low_delay= 1;
2195
2196     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2197         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2198     else
2199         avctx->pix_fmt= avctx->get_format(avctx, avctx->codec->pix_fmts);
2200     avctx->hwaccel = ff_find_hwaccel(avctx->codec->id, avctx->pix_fmt);
2201
2202     decode_init_vlc();
2203
2204     if(avctx->extradata_size > 0 && avctx->extradata &&
2205        *(char *)avctx->extradata == 1){
2206         h->is_avc = 1;
2207         h->got_avcC = 0;
2208     } else {
2209         h->is_avc = 0;
2210     }
2211
2212     h->thread_context[0] = h;
2213     h->outputed_poc = INT_MIN;
2214     h->prev_poc_msb= 1<<16;
2215     reset_sei(h);
2216     if(avctx->codec_id == CODEC_ID_H264){
2217         if(avctx->ticks_per_frame == 1){
2218             s->avctx->time_base.den *=2;
2219         }
2220         avctx->ticks_per_frame = 2;
2221     }
2222     return 0;
2223 }
2224
2225 static int frame_start(H264Context *h){
2226     MpegEncContext * const s = &h->s;
2227     int i;
2228
2229     if(MPV_frame_start(s, s->avctx) < 0)
2230         return -1;
2231     ff_er_frame_start(s);
2232     /*
2233      * MPV_frame_start uses pict_type to derive key_frame.
2234      * This is incorrect for H.264; IDR markings must be used.
2235      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2236      * See decode_nal_units().
2237      */
2238     s->current_picture_ptr->key_frame= 0;
2239
2240     assert(s->linesize && s->uvlinesize);
2241
2242     for(i=0; i<16; i++){
2243         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2244         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2245     }
2246     for(i=0; i<4; i++){
2247         h->block_offset[16+i]=
2248         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2249         h->block_offset[24+16+i]=
2250         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2251     }
2252
2253     /* can't be in alloc_tables because linesize isn't known there.
2254      * FIXME: redo bipred weight to not require extra buffer? */
2255     for(i = 0; i < s->avctx->thread_count; i++)
2256         if(!h->thread_context[i]->s.obmc_scratchpad)
2257             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2258
2259     /* some macroblocks will be accessed before they're available */
2260     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2261         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2262
2263 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2264
2265     // We mark the current picture as non-reference after allocating it, so
2266     // that if we break out due to an error it can be released automatically
2267     // in the next MPV_frame_start().
2268     // SVQ3 as well as most other codecs have only last/next/current and thus
2269     // get released even with set reference, besides SVQ3 and others do not
2270     // mark frames as reference later "naturally".
2271     if(s->codec_id != CODEC_ID_SVQ3)
2272         s->current_picture_ptr->reference= 0;
2273
2274     s->current_picture_ptr->field_poc[0]=
2275     s->current_picture_ptr->field_poc[1]= INT_MAX;
2276     assert(s->current_picture_ptr->long_ref==0);
2277
2278     return 0;
2279 }
2280
/**
 * Saves the right-most column and the bottom row of the current macroblock
 * into h->left_border and h->top_borders.
 *
 * hl_decode_mb_internal() calls this after reconstruction and before the
 * loop filter runs on the macroblock, so the saved samples are the ones that
 * existed before this macroblock was filtered.
 *
 * Layout as written by this function:
 *  - top_borders[idx][mb_x]: bytes 0..15 luma, 16..23 cb, 24..31 cr
 *  - left_border: luma at [offset + i*step], cb at [uvoffset+34 + i*step],
 *    cr at [uvoffset+34+18 + i*step]
 *
 * @param h          H.264 context
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left cb sample of the macroblock
 * @param src_cr     top-left cr sample of the macroblock
 * @param linesize   luma line stride in bytes
 * @param uvlinesize chroma line stride in bytes
 * @param simple     nonzero skips the MBAFF handling and the
 *                   CODEC_FLAG_GRAY check
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    MpegEncContext * const s = &h->s;
    int i;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;
    int skiplast= 0;

    // step back one line: row i below now addresses macroblock row i-1
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            // bottom macroblock of a pair
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
            if(!MB_MBAFF){
                // additionally store the second to last line in top_borders[0]
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
                if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
                }
            }
        }else{
            // top macroblock of a pair
            if(!MB_MBAFF){
                h->left_border[0]= h->top_borders[0][s->mb_x][15];
                if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
                    h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
                }
                // the last line is left out of the left_border loops below
                skiplast= 1;
            }
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    // The old top border is read here before it is overwritten further down.
    h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
    for(i=1; i<17 - skiplast; i++){
        h->left_border[offset+i*step]= src_y[15+i*  linesize];
    }

    // last luma line of the macroblock becomes the new top border
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        // same procedure for both chroma planes
        h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
        h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
        for(i=1; i<9 - skiplast; i++){
            h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
            h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
2343
/**
 * Exchanges the samples bordering the current macroblock in the picture
 * (the column to its left and the line above it) with the samples stored in
 * h->left_border and h->top_borders.
 *
 * hl_decode_mb_internal() calls this for intra macroblocks when the loop
 * filter is enabled: once with xchg=1 before intra prediction and once with
 * xchg=0 afterwards. Presumably this lets prediction see the neighbouring
 * samples saved by backup_mb_border() instead of the already filtered
 * picture content - confirm against the callers.
 *
 * @param h          H.264 context
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left cb sample of the macroblock
 * @param src_cr     top-left cr sample of the macroblock
 * @param linesize   luma line stride in bytes
 * @param uvlinesize chroma line stride in bytes
 * @param xchg       passed to the XCHG invocations that are not hard-wired
 *                   to 1: nonzero swaps both sides, zero only writes the
 *                   stored border into the picture
 * @param simple     nonzero skips the MBAFF handling and the
 *                   CODEC_FLAG_GRAY check
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left;
    int deblock_top;
    int mb_xy;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;

    // same left_border/top_borders addressing as in backup_mb_border()
    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
        }else{
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    if(h->deblocking_filter == 2) {
        // mode 2: an edge only counts if the neighbour has the same
        // slice table entry as the current macroblock
        mb_xy = h->mb_xy;
        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
        deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
    } else {
        // otherwise only the picture position decides
        deblock_left = (s->mb_x > 0);
        deblock_top =  (s->mb_y > !!MB_FIELD);
    }

    // move to the sample above and to the left of the macroblock
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

// b always receives the old value of a; a receives the old value of b only
// if xchg is nonzero. t is a scratch variable.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // starts at the top-left corner sample only if the top edge counts too
        for(i = !deblock_top; i<16; i++){
            XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
        }
        // i == 16 here: this entry is always swapped
        XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        // first 8 samples above the macroblock to the right, if there is one
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<8; i++){
                XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
            }
            // i == 8 here: these entries are always swapped
            XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
            XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
2417
/**
 * Reconstructs the current macroblock into the current picture and, if
 * h->deblocking_filter is set, runs the loop filter on it.
 *
 * Order of operations:
 *  -# compute the destination pointers and prefetch picture data
 *  -# select frame or field line strides and block offsets
 *  -# produce the samples: raw copy for PCM macroblocks, intra prediction
 *     plus residual for intra macroblocks, hl_motion() plus residual for
 *     inter macroblocks (H.264 only)
 *  -# clear the coefficient buffer h->mb if it was used
 *  -# back up the macroblock borders and call the loop filter
 *
 * @param h      H.264 context; the macroblock position is taken from
 *               h->s.mb_x, h->s.mb_y and h->mb_xy.
 * @param simple nonzero disables everything guarded by "!simple" (field and
 *               MBAFF handling, PCM macroblocks, transform bypass) and makes
 *               the CODEC_FLAG_GRAY, encoding and SVQ3 checks pass
 *               unconditionally. The two wrappers below pass constants.
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
    /* is_h264 should always be true if SVQ3 is disabled. */
    const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    // top-left sample of the macroblock in each plane (16x16 luma, 8x8 chroma)
    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        // field macroblock: every second picture line, offsets from the
        // second half of block_offset (computed with doubled strides)
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            // rewrite the cached reference indices of the used lists to
            // (16+ref)^(mb_y&1) before motion compensation
            int list;
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        // PCM: the samples are copied straight out of h->mb
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            // swap in the saved border samples for the duration of the
            // intra prediction (undone by the xchg=0 call further down)
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                // prediction and residual are interleaved block by block
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels8;
                        }else{
                            idct_dc_add = s->dsp.h264_idct8_dc_add;
                            idct_add    = s->dsp.h264_idct8_add;
                        }
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
                                    // a single nonzero coefficient that is the
                                    // first one takes the DC-only routine
                                    if(nnz == 1 && h->mb[i*16])
                                        idct_dc_add(ptr, h->mb + i*16, linesize);
                                    else
                                        idct_add   (ptr, h->mb + i*16, linesize);
                                }
                            }
                        }
                    }else{
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels4;
                        }else{
                            idct_dc_add = s->dsp.h264_idct_dc_add;
                            idct_add    = s->dsp.h264_idct_add;
                        }
                        for(i=0; i<16; i++){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
                                        // no top-right block: repeat the last
                                        // sample of the line above four times
                                        // NOTE(review): int multiplication, looks like it
                                        // can overflow for samples >= 0x80 - confirm
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
                                    }else
                                        topright= ptr + 4 - linesize;
                                }else
                                    topright= NULL;

                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
                                        if(nnz == 1 && h->mb[i*16])
                                            idct_dc_add(ptr, h->mb + i*16, linesize);
                                        else
                                            idct_add   (ptr, h->mb + i*16, linesize);
                                    }else
                                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                                }
                            }
                        }
                    }
                }
            }else{
                // intra 16x16: predict the whole macroblock, then process the
                // luma DC coefficients; the residual is added further down
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            // inter macroblock: motion compensated prediction
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        // luma residual for everything except intra 4x4, which was
        // already handled block by block above
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    if(transform_bypass){
                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                    }
                }else if(h->cbp&15){
                    // the low four cbp bits flag coded luma
                    if(transform_bypass){
                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                        if(IS_8x8DCT(mb_type)){
                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }else{
                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        // chroma residual; cbp bits 0x30 flag coded chroma. Blocks 16..19 go
        // to dest[0] (cb) and 20..23 to dest[1] (cr) via (i&4)>>2.
        if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }
            }else{
                // dequant tables 1/2 are used for intra cb/cr, 4/5 for inter cb/cr
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                if(is_h264){
                    idct_add = s->dsp.h264_idct_add;
                    idct_dc_add = s->dsp.h264_idct_dc_add;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }else{
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                            svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                        }
                    }
                }
            }
        }
    }
    // reset the coefficient buffer if it may hold data
    if(h->cbp || IS_INTRA(mb_type))
        s->dsp.clear_blocks(h->mb);

    if(h->deblocking_filter) {
        // save the borders first, then filter
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2672
2673 /**
2674  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2675  */
2676 static void hl_decode_mb_simple(H264Context *h){
2677     hl_decode_mb_internal(h, 1);
2678 }
2679
2680 /**
2681  * Process a macroblock; this handles edge cases, such as interlacing.
2682  */
2683 static void av_noinline hl_decode_mb_complex(H264Context *h){
2684     hl_decode_mb_internal(h, 0);
2685 }
2686
2687 static void hl_decode_mb(H264Context *h){
2688     MpegEncContext * const s = &h->s;
2689     const int mb_xy= h->mb_xy;
2690     const int mb_type= s->current_picture.mb_type[mb_xy];
2691     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2692
2693     if (is_complex)
2694         hl_decode_mb_complex(h);
2695     else hl_decode_mb_simple(h);
2696 }
2697
2698 static void pic_as_field(Picture *pic, const int parity){
2699     int i;
2700     for (i = 0; i < 4; ++i) {
2701         if (parity == PICT_BOTTOM_FIELD)
2702             pic->data[i] += pic->linesize[i];
2703         pic->reference = parity;
2704         pic->linesize[i] *= 2;
2705     }
2706     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2707 }
2708
2709 static int split_field_copy(Picture *dest, Picture *src,
2710                             int parity, int id_add){
2711     int match = !!(src->reference & parity);
2712
2713     if (match) {
2714         *dest = *src;
2715         if(parity != PICT_FRAME){
2716             pic_as_field(dest, parity);
2717             dest->pic_id *= 2;
2718             dest->pic_id += id_add;
2719         }
2720     }
2721
2722     return match;
2723 }
2724
2725 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2726     int i[2]={0};
2727     int index=0;
2728
2729     while(i[0]<len || i[1]<len){
2730         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2731             i[0]++;
2732         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2733             i[1]++;
2734         if(i[0] < len){
2735             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2736             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2737         }
2738         if(i[1] < len){
2739             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2740             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2741         }
2742     }
2743
2744     return index;
2745 }
2746
2747 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2748     int i, best_poc;
2749     int out_i= 0;
2750
2751     for(;;){
2752         best_poc= dir ? INT_MIN : INT_MAX;
2753
2754         for(i=0; i<len; i++){
2755             const int poc= src[i]->poc;
2756             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2757                 best_poc= poc;
2758                 sorted[out_i]= src[i];
2759             }
2760         }
2761         if(best_poc == (dir ? INT_MIN : INT_MAX))
2762             break;
2763         limit= sorted[out_i++]->poc - dir;
2764     }
2765     return out_i;
2766 }
2767
2768 /**
2769  * fills the default_ref_list.
2770  */
2771 static int fill_default_ref_list(H264Context *h){
2772     MpegEncContext * const s = &h->s;
2773     int i, len;
2774
2775     if(h->slice_type_nos==FF_B_TYPE){
2776         Picture *sorted[32];
2777         int cur_poc, list;
2778         int lens[2];
2779
2780         if(FIELD_PICTURE)
2781             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2782         else
2783             cur_poc= s->current_picture_ptr->poc;
2784
2785         for(list= 0; list<2; list++){
2786             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2787             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2788             assert(len<=32);
2789             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2790             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2791             assert(len<=32);
2792
2793             if(len < h->ref_count[list])
2794                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2795             lens[list]= len;
2796         }
2797
2798         if(lens[0] == lens[1] && lens[1] > 1){
2799             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2800             if(i == lens[0])
2801                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2802         }
2803     }else{
2804         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2805         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2806         assert(len <= 32);
2807         if(len < h->ref_count[0])
2808             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2809     }
2810 #ifdef TRACE
2811     for (i=0; i<h->ref_count[0]; i++) {
2812         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2813     }
2814     if(h->slice_type_nos==FF_B_TYPE){
2815         for (i=0; i<h->ref_count[1]; i++) {
2816             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2817         }
2818     }
2819 #endif
2820     return 0;
2821 }
2822
2823 static void print_short_term(H264Context *h);
2824 static void print_long_term(H264Context *h);
2825
2826 /**
2827  * Extract structure information about the picture described by pic_num in
2828  * the current decoding context (frame or field). Note that pic_num is
2829  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2830  * @param pic_num picture number for which to extract structure information
2831  * @param structure one of PICT_XXX describing structure of picture
2832  *                      with pic_num
2833  * @return frame number (short term) or long term index of picture
2834  *         described by pic_num
2835  */
2836 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2837     MpegEncContext * const s = &h->s;
2838
2839     *structure = s->picture_structure;
2840     if(FIELD_PICTURE){
2841         if (!(pic_num & 1))
2842             /* opposite field */
2843             *structure ^= PICT_FRAME;
2844         pic_num >>= 1;
2845     }
2846
2847     return pic_num;
2848 }
2849
2850 static int decode_ref_pic_list_reordering(H264Context *h){
2851     MpegEncContext * const s = &h->s;
2852     int list, index, pic_structure;
2853
2854     print_short_term(h);
2855     print_long_term(h);
2856
2857     for(list=0; list<h->list_count; list++){
2858         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2859
2860         if(get_bits1(&s->gb)){
2861             int pred= h->curr_pic_num;
2862
2863             for(index=0; ; index++){
2864                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2865                 unsigned int pic_id;
2866                 int i;
2867                 Picture *ref = NULL;
2868
2869                 if(reordering_of_pic_nums_idc==3)
2870                     break;
2871
2872                 if(index >= h->ref_count[list]){
2873                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2874                     return -1;
2875                 }
2876
2877                 if(reordering_of_pic_nums_idc<3){
2878                     if(reordering_of_pic_nums_idc<2){
2879                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2880                         int frame_num;
2881
2882                         if(abs_diff_pic_num > h->max_pic_num){
2883                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2884                             return -1;
2885                         }
2886
2887                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2888                         else                                pred+= abs_diff_pic_num;
2889                         pred &= h->max_pic_num - 1;
2890
2891                         frame_num = pic_num_extract(h, pred, &pic_structure);
2892
2893                         for(i= h->short_ref_count-1; i>=0; i--){
2894                             ref = h->short_ref[i];
2895                             assert(ref->reference);
2896                             assert(!ref->long_ref);
2897                             if(
2898                                    ref->frame_num == frame_num &&
2899                                    (ref->reference & pic_structure)
2900                               )
2901                                 break;
2902                         }
2903                         if(i>=0)
2904                             ref->pic_id= pred;
2905                     }else{
2906                         int long_idx;
2907                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2908
2909                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2910
2911                         if(long_idx>31){
2912                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2913                             return -1;
2914                         }
2915                         ref = h->long_ref[long_idx];
2916                         assert(!(ref && !ref->reference));
2917                         if(ref && (ref->reference & pic_structure)){
2918                             ref->pic_id= pic_id;
2919                             assert(ref->long_ref);
2920                             i=0;
2921                         }else{
2922                             i=-1;
2923                         }
2924                     }
2925
2926                     if (i < 0) {
2927                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2928                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2929                     } else {
2930                         for(i=index; i+1<h->ref_count[list]; i++){
2931                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2932                                 break;
2933                         }
2934                         for(; i > index; i--){
2935                             h->ref_list[list][i]= h->ref_list[list][i-1];
2936                         }
2937                         h->ref_list[list][index]= *ref;
2938                         if (FIELD_PICTURE){
2939                             pic_as_field(&h->ref_list[list][index], pic_structure);
2940                         }
2941                     }
2942                 }else{
2943                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2944                     return -1;
2945                 }
2946             }
2947         }
2948     }
2949     for(list=0; list<h->list_count; list++){
2950         for(index= 0; index < h->ref_count[list]; index++){
2951             if(!h->ref_list[list][index].data[0]){
2952                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2953                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2954             }
2955         }
2956     }
2957
2958     return 0;
2959 }
2960
2961 static void fill_mbaff_ref_list(H264Context *h){
2962     int list, i, j;
2963     for(list=0; list<2; list++){ //FIXME try list_count
2964         for(i=0; i<h->ref_count[list]; i++){
2965             Picture *frame = &h->ref_list[list][i];
2966             Picture *field = &h->ref_list[list][16+2*i];
2967             field[0] = *frame;
2968             for(j=0; j<3; j++)
2969                 field[0].linesize[j] <<= 1;
2970             field[0].reference = PICT_TOP_FIELD;
2971             field[0].poc= field[0].field_poc[0];
2972             field[1] = field[0];
2973             for(j=0; j<3; j++)
2974                 field[1].data[j] += frame->linesize[j];
2975             field[1].reference = PICT_BOTTOM_FIELD;
2976             field[1].poc= field[1].field_poc[1];
2977
2978             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2979             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2980             for(j=0; j<2; j++){
2981                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2982                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2983             }
2984         }
2985     }
2986     for(j=0; j<h->ref_count[1]; j++){
2987         for(i=0; i<h->ref_count[0]; i++)
2988             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2989         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2990         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2991     }
2992 }
2993
2994 static int pred_weight_table(H264Context *h){
2995     MpegEncContext * const s = &h->s;
2996     int list, i;
2997     int luma_def, chroma_def;
2998
2999     h->use_weight= 0;
3000     h->use_weight_chroma= 0;
3001     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3002     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3003     luma_def = 1<<h->luma_log2_weight_denom;
3004     chroma_def = 1<<h->chroma_log2_weight_denom;
3005
3006     for(list=0; list<2; list++){
3007         h->luma_weight_flag[list]   = 0;
3008         h->chroma_weight_flag[list] = 0;
3009         for(i=0; i<h->ref_count[list]; i++){
3010             int luma_weight_flag, chroma_weight_flag;
3011
3012             luma_weight_flag= get_bits1(&s->gb);
3013             if(luma_weight_flag){
3014                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3015                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3016                 if(   h->luma_weight[list][i] != luma_def
3017                    || h->luma_offset[list][i] != 0) {
3018                     h->use_weight= 1;
3019                     h->luma_weight_flag[list]= 1;
3020                 }
3021             }else{
3022                 h->luma_weight[list][i]= luma_def;
3023                 h->luma_offset[list][i]= 0;
3024             }
3025
3026             if(CHROMA){
3027                 chroma_weight_flag= get_bits1(&s->gb);
3028                 if(chroma_weight_flag){
3029                     int j;
3030                     for(j=0; j<2; j++){
3031                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3032                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3033                         if(   h->chroma_weight[list][i][j] != chroma_def
3034                            || h->chroma_offset[list][i][j] != 0) {
3035                             h->use_weight_chroma= 1;
3036                             h->chroma_weight_flag[list]= 1;
3037                         }
3038                     }
3039                 }else{
3040                     int j;
3041                     for(j=0; j<2; j++){
3042                         h->chroma_weight[list][i][j]= chroma_def;
3043                         h->chroma_offset[list][i][j]= 0;
3044                     }
3045                 }
3046             }
3047         }
3048         if(h->slice_type_nos != FF_B_TYPE) break;
3049     }
3050     h->use_weight= h->use_weight || h->use_weight_chroma;
3051     return 0;
3052 }
3053
3054 static void implicit_weight_table(H264Context *h){
3055     MpegEncContext * const s = &h->s;
3056     int ref0, ref1, i;
3057     int cur_poc = s->current_picture_ptr->poc;
3058
3059     for (i = 0; i < 2; i++) {
3060         h->luma_weight_flag[i]   = 0;
3061         h->chroma_weight_flag[i] = 0;
3062     }
3063
3064     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3065        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3066         h->use_weight= 0;
3067         h->use_weight_chroma= 0;
3068         return;
3069     }
3070
3071     h->use_weight= 2;
3072     h->use_weight_chroma= 2;
3073     h->luma_log2_weight_denom= 5;
3074     h->chroma_log2_weight_denom= 5;
3075
3076     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3077         int poc0 = h->ref_list[0][ref0].poc;
3078         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3079             int poc1 = h->ref_list[1][ref1].poc;
3080             int td = av_clip(poc1 - poc0, -128, 127);
3081             if(td){
3082                 int tb = av_clip(cur_poc - poc0, -128, 127);
3083                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3084                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3085                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3086                     h->implicit_weight[ref0][ref1] = 32;
3087                 else
3088                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3089             }else
3090                 h->implicit_weight[ref0][ref1] = 32;
3091         }
3092     }
3093 }
3094
3095 /**
3096  * Mark a picture as no longer needed for reference. The refmask
3097  * argument allows unreferencing of individual fields or the whole frame.
3098  * If the picture becomes entirely unreferenced, but is being held for
3099  * display purposes, it is marked as such.
3100  * @param refmask mask of fields to unreference; the mask is bitwise
3101  *                anded with the reference marking of pic
3102  * @return non-zero if pic becomes entirely unreferenced (except possibly
3103  *         for display purposes) zero if one of the fields remains in
3104  *         reference
3105  */
3106 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3107     int i;
3108     if (pic->reference &= refmask) {
3109         return 0;
3110     } else {
3111         for(i = 0; h->delayed_pic[i]; i++)
3112             if(pic == h->delayed_pic[i]){
3113                 pic->reference=DELAYED_PIC_REF;
3114                 break;
3115             }
3116         return 1;
3117     }
3118 }
3119
3120 /**
3121  * instantaneous decoder refresh.
3122  */
3123 static void idr(H264Context *h){
3124     int i;
3125
3126     for(i=0; i<16; i++){
3127         remove_long(h, i, 0);
3128     }
3129     assert(h->long_ref_count==0);
3130
3131     for(i=0; i<h->short_ref_count; i++){
3132         unreference_pic(h, h->short_ref[i], 0);
3133         h->short_ref[i]= NULL;
3134     }
3135     h->short_ref_count=0;
3136     h->prev_frame_num= 0;
3137     h->prev_frame_num_offset= 0;
3138     h->prev_poc_msb=
3139     h->prev_poc_lsb= 0;
3140 }
3141
3142 /* forget old pics after a seek */
3143 static void flush_dpb(AVCodecContext *avctx){
3144     H264Context *h= avctx->priv_data;
3145     int i;
3146     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3147         if(h->delayed_pic[i])
3148             h->delayed_pic[i]->reference= 0;
3149         h->delayed_pic[i]= NULL;
3150     }
3151     h->outputed_poc= INT_MIN;
3152     idr(h);
3153     if(h->s.current_picture_ptr)
3154         h->s.current_picture_ptr->reference= 0;
3155     h->s.first_field= 0;
3156     reset_sei(h);
3157     ff_mpeg_flush(avctx);
3158 }
3159
3160 /**
3161  * Find a Picture in the short term reference list by frame number.
3162  * @param frame_num frame number to search for
3163  * @param idx the index into h->short_ref where returned picture is found
3164  *            undefined if no picture found.
3165  * @return pointer to the found picture, or NULL if no pic with the provided
3166  *                 frame number is found
3167  */
3168 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3169     MpegEncContext * const s = &h->s;
3170     int i;
3171
3172     for(i=0; i<h->short_ref_count; i++){
3173         Picture *pic= h->short_ref[i];
3174         if(s->avctx->debug&FF_DEBUG_MMCO)
3175             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3176         if(pic->frame_num == frame_num) {
3177             *idx = i;
3178             return pic;
3179         }
3180     }
3181     return NULL;
3182 }
3183
3184 /**
3185  * Remove a picture from the short term reference list by its index in
3186  * that list.  This does no checking on the provided index; it is assumed
3187  * to be valid. Other list entries are shifted down.
3188  * @param i index into h->short_ref of picture to remove.
3189  */
3190 static void remove_short_at_index(H264Context *h, int i){
3191     assert(i >= 0 && i < h->short_ref_count);
3192     h->short_ref[i]= NULL;
3193     if (--h->short_ref_count)
3194         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3195 }
3196
3197 /**
3198  *
3199  * @return the removed picture or NULL if an error occurs
3200  */
3201 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3202     MpegEncContext * const s = &h->s;
3203     Picture *pic;
3204     int i;
3205
3206     if(s->avctx->debug&FF_DEBUG_MMCO)
3207         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3208
3209     pic = find_short(h, frame_num, &i);
3210     if (pic){
3211         if(unreference_pic(h, pic, ref_mask))
3212         remove_short_at_index(h, i);
3213     }
3214
3215     return pic;
3216 }
3217
3218 /**
3219  * Remove a picture from the long term reference list by its index in
3220  * that list.
3221  * @return the removed picture or NULL if an error occurs
3222  */
3223 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3224     Picture *pic;
3225
3226     pic= h->long_ref[i];
3227     if (pic){
3228         if(unreference_pic(h, pic, ref_mask)){
3229             assert(h->long_ref[i]->long_ref == 1);
3230             h->long_ref[i]->long_ref= 0;
3231             h->long_ref[i]= NULL;
3232             h->long_ref_count--;
3233         }
3234     }
3235
3236     return pic;
3237 }
3238
3239 /**
3240  * print short term list
3241  */
3242 static void print_short_term(H264Context *h) {
3243     uint32_t i;
3244     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3245         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3246         for(i=0; i<h->short_ref_count; i++){
3247             Picture *pic= h->short_ref[i];
3248             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3249         }
3250     }
3251 }
3252
3253 /**
3254  * print long term list
3255  */
3256 static void print_long_term(H264Context *h) {
3257     uint32_t i;
3258     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3259         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3260         for(i = 0; i < 16; i++){
3261             Picture *pic= h->long_ref[i];
3262             if (pic) {
3263                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3264             }
3265         }
3266     }
3267 }
3268
3269 /**
3270  * Executes the reference picture marking (memory management control operations).
3271  */
3272 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3273     MpegEncContext * const s = &h->s;
3274     int i, av_uninit(j);
3275     int current_ref_assigned=0;
3276     Picture *av_uninit(pic);
3277
3278     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3279         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3280
3281     for(i=0; i<mmco_count; i++){
3282         int av_uninit(structure), av_uninit(frame_num);
3283         if(s->avctx->debug&FF_DEBUG_MMCO)
3284             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3285
3286         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3287            || mmco[i].opcode == MMCO_SHORT2LONG){
3288             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3289             pic = find_short(h, frame_num, &j);
3290             if(!pic){
3291                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3292                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3293                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3294                 continue;
3295             }
3296         }
3297
3298         switch(mmco[i].opcode){
3299         case MMCO_SHORT2UNUSED:
3300             if(s->avctx->debug&FF_DEBUG_MMCO)
3301                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3302             remove_short(h, frame_num, structure ^ PICT_FRAME);
3303             break;
3304         case MMCO_SHORT2LONG:
3305                 if (h->long_ref[mmco[i].long_arg] != pic)
3306                     remove_long(h, mmco[i].long_arg, 0);
3307
3308                 remove_short_at_index(h, j);
3309                 h->long_ref[ mmco[i].long_arg ]= pic;
3310                 if (h->long_ref[ mmco[i].long_arg ]){
3311                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3312                     h->long_ref_count++;
3313                 }
3314             break;
3315         case MMCO_LONG2UNUSED:
3316             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3317             pic = h->long_ref[j];
3318             if (pic) {
3319                 remove_long(h, j, structure ^ PICT_FRAME);
3320             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3321                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3322             break;
3323         case MMCO_LONG:
3324                     // Comment below left from previous code as it is an interresting note.
3325                     /* First field in pair is in short term list or
3326                      * at a different long term index.
3327                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3328                      * Report the problem and keep the pair where it is,
3329                      * and mark this field valid.
3330                      */
3331
3332             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3333                 remove_long(h, mmco[i].long_arg, 0);
3334
3335                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3336                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3337                 h->long_ref_count++;
3338             }
3339
3340             s->current_picture_ptr->reference |= s->picture_structure;
3341             current_ref_assigned=1;
3342             break;
3343         case MMCO_SET_MAX_LONG:
3344             assert(mmco[i].long_arg <= 16);
3345             // just remove the long term which index is greater than new max
3346             for(j = mmco[i].long_arg; j<16; j++){
3347                 remove_long(h, j, 0);
3348             }
3349             break;
3350         case MMCO_RESET:
3351             while(h->short_ref_count){
3352                 remove_short(h, h->short_ref[0]->frame_num, 0);
3353             }
3354             for(j = 0; j < 16; j++) {
3355                 remove_long(h, j, 0);
3356             }
3357             s->current_picture_ptr->poc=
3358             s->current_picture_ptr->field_poc[0]=
3359             s->current_picture_ptr->field_poc[1]=
3360             h->poc_lsb=
3361             h->poc_msb=
3362             h->frame_num=
3363             s->current_picture_ptr->frame_num= 0;
3364             break;
3365         default: assert(0);
3366         }
3367     }
3368
3369     if (!current_ref_assigned) {
3370         /* Second field of complementary field pair; the first field of
3371          * which is already referenced. If short referenced, it
3372          * should be first entry in short_ref. If not, it must exist
3373          * in long_ref; trying to put it on the short list here is an
3374          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3375          */
3376         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3377             /* Just mark the second field valid */
3378             s->current_picture_ptr->reference = PICT_FRAME;
3379         } else if (s->current_picture_ptr->long_ref) {
3380             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3381                                              "assignment for second field "
3382                                              "in complementary field pair "
3383                                              "(first field is long term)\n");
3384         } else {
3385             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3386             if(pic){
3387                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3388             }
3389
3390             if(h->short_ref_count)
3391                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3392
3393             h->short_ref[0]= s->current_picture_ptr;
3394             h->short_ref_count++;
3395             s->current_picture_ptr->reference |= s->picture_structure;
3396         }
3397     }
3398
3399     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3400
3401         /* We have too many reference frames, probably due to corrupted
3402          * stream. Need to discard one frame. Prevents overrun of the
3403          * short_ref and long_ref buffers.
3404          */
3405         av_log(h->s.avctx, AV_LOG_ERROR,
3406                "number of reference frames exceeds max (probably "
3407                "corrupt input), discarding one\n");
3408
3409         if (h->long_ref_count && !h->short_ref_count) {
3410             for (i = 0; i < 16; ++i)
3411                 if (h->long_ref[i])
3412                     break;
3413
3414             assert(i < 16);
3415             remove_long(h, i, 0);
3416         } else {
3417             pic = h->short_ref[h->short_ref_count - 1];
3418             remove_short(h, pic->frame_num, 0);
3419         }
3420     }
3421
3422     print_short_term(h);
3423     print_long_term(h);
3424     return 0;
3425 }
3426
3427 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3428     MpegEncContext * const s = &h->s;
3429     int i;
3430
3431     h->mmco_index= 0;
3432     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3433         s->broken_link= get_bits1(gb) -1;
3434         if(get_bits1(gb)){
3435             h->mmco[0].opcode= MMCO_LONG;
3436             h->mmco[0].long_arg= 0;
3437             h->mmco_index= 1;
3438         }
3439     }else{
3440         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3441             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3442                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3443
3444                 h->mmco[i].opcode= opcode;
3445                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3446                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3447 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3448                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3449                         return -1;
3450                     }*/
3451                 }
3452                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3453                     unsigned int long_arg= get_ue_golomb_31(gb);
3454                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3455                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3456                         return -1;
3457                     }
3458                     h->mmco[i].long_arg= long_arg;
3459                 }
3460
3461                 if(opcode > (unsigned)MMCO_LONG){
3462                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3463                     return -1;
3464                 }
3465                 if(opcode == MMCO_END)
3466                     break;
3467             }
3468             h->mmco_index= i;
3469         }else{
3470             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3471
3472             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3473                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3474                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3475                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3476                 h->mmco_index= 1;
3477                 if (FIELD_PICTURE) {
3478                     h->mmco[0].short_pic_num *= 2;
3479                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3480                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3481                     h->mmco_index= 2;
3482                 }
3483             }
3484         }
3485     }
3486
3487     return 0;
3488 }
3489
3490 static int init_poc(H264Context *h){
3491     MpegEncContext * const s = &h->s;
3492     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3493     int field_poc[2];
3494     Picture *cur = s->current_picture_ptr;
3495
3496     h->frame_num_offset= h->prev_frame_num_offset;
3497     if(h->frame_num < h->prev_frame_num)
3498         h->frame_num_offset += max_frame_num;
3499
3500     if(h->sps.poc_type==0){
3501         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3502
3503         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3504             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3505         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3506             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3507         else
3508             h->poc_msb = h->prev_poc_msb;
3509 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3510         field_poc[0] =
3511         field_poc[1] = h->poc_msb + h->poc_lsb;
3512         if(s->picture_structure == PICT_FRAME)
3513             field_poc[1] += h->delta_poc_bottom;
3514     }else if(h->sps.poc_type==1){
3515         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3516         int i;
3517
3518         if(h->sps.poc_cycle_length != 0)
3519             abs_frame_num = h->frame_num_offset + h->frame_num;
3520         else
3521             abs_frame_num = 0;
3522
3523         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3524             abs_frame_num--;
3525
3526         expected_delta_per_poc_cycle = 0;
3527         for(i=0; i < h->sps.poc_cycle_length; i++)
3528             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3529
3530         if(abs_frame_num > 0){
3531             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3532             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3533
3534             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3535             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3536                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3537         } else
3538             expectedpoc = 0;
3539
3540         if(h->nal_ref_idc == 0)
3541             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3542
3543         field_poc[0] = expectedpoc + h->delta_poc[0];
3544         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3545
3546         if(s->picture_structure == PICT_FRAME)
3547             field_poc[1] += h->delta_poc[1];
3548     }else{
3549         int poc= 2*(h->frame_num_offset + h->frame_num);
3550
3551         if(!h->nal_ref_idc)
3552             poc--;
3553
3554         field_poc[0]= poc;
3555         field_poc[1]= poc;
3556     }
3557
3558     if(s->picture_structure != PICT_BOTTOM_FIELD)
3559         s->current_picture_ptr->field_poc[0]= field_poc[0];
3560     if(s->picture_structure != PICT_TOP_FIELD)
3561         s->current_picture_ptr->field_poc[1]= field_poc[1];
3562     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3563
3564     return 0;
3565 }
3566
3567
3568 /**
3569  * initialize scan tables
3570  */
3571 static void init_scan_tables(H264Context *h){
3572     MpegEncContext * const s = &h->s;
3573     int i;
3574     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3575         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3576         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3577     }else{
3578         for(i=0; i<16; i++){
3579 #define T(x) (x>>2) | ((x<<2) & 0xF)
3580             h->zigzag_scan[i] = T(zigzag_scan[i]);
3581             h-> field_scan[i] = T( field_scan[i]);
3582 #undef T
3583         }
3584     }
3585     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3586         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3587         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3588         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3589         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3590     }else{
3591         for(i=0; i<64; i++){
3592 #define T(x) (x>>3) | ((x&7)<<3)
3593             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3594             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3595             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3596             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3597 #undef T
3598         }
3599     }
3600     if(h->sps.transform_bypass){ //FIXME same ugly
3601         h->zigzag_scan_q0          = zigzag_scan;
3602         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3603         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3604         h->field_scan_q0           = field_scan;
3605         h->field_scan8x8_q0        = field_scan8x8;
3606         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3607     }else{
3608         h->zigzag_scan_q0          = h->zigzag_scan;
3609         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3610         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3611         h->field_scan_q0           = h->field_scan;
3612         h->field_scan8x8_q0        = h->field_scan8x8;
3613         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3614     }
3615 }
3616
3617 static void field_end(H264Context *h){
3618     MpegEncContext * const s = &h->s;
3619     AVCodecContext * const avctx= s->avctx;
3620     s->mb_y= 0;
3621
3622     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
3623     s->current_picture_ptr->pict_type= s->pict_type;
3624
3625     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3626         ff_vdpau_h264_set_reference_frames(s);
3627
3628     if(!s->dropable) {
3629         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
3630         h->prev_poc_msb= h->poc_msb;
3631         h->prev_poc_lsb= h->poc_lsb;
3632     }
3633     h->prev_frame_num_offset= h->frame_num_offset;
3634     h->prev_frame_num= h->frame_num;
3635
3636     if (avctx->hwaccel) {
3637         if (avctx->hwaccel->end_frame(avctx) < 0)
3638             av_log(avctx, AV_LOG_ERROR, "hardware accelerator failed to decode picture\n");
3639     }
3640
3641     if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
3642         ff_vdpau_h264_picture_complete(s);
3643
3644     /*
3645      * FIXME: Error handling code does not seem to support interlaced
3646      * when slices span multiple rows
3647      * The ff_er_add_slice calls don't work right for bottom
3648      * fields; they cause massive erroneous error concealing
3649      * Error marking covers both fields (top and bottom).
3650      * This causes a mismatched s->error_count
3651      * and a bad error table. Further, the error count goes to
3652      * INT_MAX when called for bottom field, because mb_y is
3653      * past end by one (callers fault) and resync_mb_y != 0
3654      * causes problems for the first MB line, too.
3655      */
3656     if (!FIELD_PICTURE)
3657         ff_er_frame_end(s);
3658
3659     MPV_frame_end(s);
3660 }
3661
3662 /**
3663  * Replicates H264 "master" context to thread contexts.
3664  */
3665 static void clone_slice(H264Context *dst, H264Context *src)
3666 {
3667     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3668     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3669     dst->s.current_picture      = src->s.current_picture;
3670     dst->s.linesize             = src->s.linesize;
3671     dst->s.uvlinesize           = src->s.uvlinesize;
3672     dst->s.first_field          = src->s.first_field;
3673
3674     dst->prev_poc_msb           = src->prev_poc_msb;
3675     dst->prev_poc_lsb           = src->prev_poc_lsb;
3676     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3677     dst->prev_frame_num         = src->prev_frame_num;
3678     dst->short_ref_count        = src->short_ref_count;
3679
3680     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3681     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3682     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3683     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3684
3685     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3686     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3687 }
3688
3689 /**
3690  * decodes a slice header.
3691  * This will also call MPV_common_init() and frame_start() as needed.
3692  *
3693  * @param h h264context
3694  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3695  *
3696  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3697  */
3698 static int decode_slice_header(H264Context *h, H264Context *h0){
3699     MpegEncContext * const s = &h->s;
3700     MpegEncContext * const s0 = &h0->s;
3701     unsigned int first_mb_in_slice;
3702     unsigned int pps_id;
3703     int num_ref_idx_active_override_flag;
3704     unsigned int slice_type, tmp, i, j;
3705     int default_ref_list_done = 0;
3706     int last_pic_structure;
3707
3708     s->dropable= h->nal_ref_idc == 0;
3709
3710     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3711         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3712         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3713     }else{
3714         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3715         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3716     }
3717
3718     first_mb_in_slice= get_ue_golomb(&s->gb);
3719
3720     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3721         h0->current_slice = 0;
3722         if (!s0->first_field)
3723             s->current_picture_ptr= NULL;
3724     }
3725
3726     slice_type= get_ue_golomb_31(&s->gb);
3727     if(slice_type > 9){
3728         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3729         return -1;
3730     }
3731     if(slice_type > 4){
3732         slice_type -= 5;
3733         h->slice_type_fixed=1;
3734     }else
3735         h->slice_type_fixed=0;
3736
3737     slice_type= golomb_to_pict_type[ slice_type ];
3738     if (slice_type == FF_I_TYPE
3739         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3740         default_ref_list_done = 1;
3741     }
3742     h->slice_type= slice_type;
3743     h->slice_type_nos= slice_type & 3;
3744
3745     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3746     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3747         av_log(h->s.avctx, AV_LOG_ERROR,
3748                "B picture before any references, skipping\n");
3749         return -1;
3750     }
3751
3752     pps_id= get_ue_golomb(&s->gb);
3753     if(pps_id>=MAX_PPS_COUNT){
3754         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3755         return -1;
3756     }
3757     if(!h0->pps_buffers[pps_id]) {
3758         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS %u referenced\n", pps_id);
3759         return -1;
3760     }
3761     h->pps= *h0->pps_buffers[pps_id];
3762
3763     if(!h0->sps_buffers[h->pps.sps_id]) {
3764         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %u referenced\n", h->pps.sps_id);
3765         return -1;
3766     }
3767     h->sps = *h0->sps_buffers[h->pps.sps_id];
3768
3769     if(h == h0 && h->dequant_coeff_pps != pps_id){
3770         h->dequant_coeff_pps = pps_id;
3771         init_dequant_tables(h);
3772     }
3773
3774     s->mb_width= h->sps.mb_width;
3775     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3776
3777     h->b_stride=  s->mb_width*4;
3778     h->b8_stride= s->mb_width*2;
3779
3780     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3781     if(h->sps.frame_mbs_only_flag)
3782         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3783     else
3784         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3785
3786     if (s->context_initialized
3787         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3788         if(h != h0)
3789             return -1;   // width / height changed during parallelized decoding
3790         free_tables(h);
3791         flush_dpb(s->avctx);
3792         MPV_common_end(s);
3793     }
3794     if (!s->context_initialized) {
3795         if(h != h0)
3796             return -1;  // we cant (re-)initialize context during parallel decoding
3797         if (MPV_common_init(s) < 0)
3798             return -1;
3799         s->first_field = 0;
3800
3801         init_scan_tables(h);
3802         alloc_tables(h);
3803
3804         for(i = 1; i < s->avctx->thread_count; i++) {
3805             H264Context *c;
3806             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3807             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3808             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3809             c->sps = h->sps;
3810             c->pps = h->pps;
3811             init_scan_tables(c);
3812             clone_tables(c, h);
3813         }
3814
3815         for(i = 0; i < s->avctx->thread_count; i++)
3816             if(context_init(h->thread_context[i]) < 0)
3817                 return -1;
3818
3819         s->avctx->width = s->width;
3820         s->avctx->height = s->height;
3821         s->avctx->sample_aspect_ratio= h->sps.sar;
3822         if(!s->avctx->sample_aspect_ratio.den)
3823             s->avctx->sample_aspect_ratio.den = 1;
3824
3825         if(h->sps.timing_info_present_flag){
3826             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3827             if(h->x264_build > 0 && h->x264_build < 44)
3828                 s->avctx->time_base.den *= 2;
3829             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3830                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3831         }
3832     }
3833
3834     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3835
3836     h->mb_mbaff = 0;
3837     h->mb_aff_frame = 0;
3838     last_pic_structure = s0->picture_structure;
3839     if(h->sps.frame_mbs_only_flag){
3840         s->picture_structure= PICT_FRAME;
3841     }else{
3842         if(get_bits1(&s->gb)) { //field_pic_flag
3843             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3844         } else {
3845             s->picture_structure= PICT_FRAME;
3846             h->mb_aff_frame = h->sps.mb_aff;
3847         }
3848     }
3849     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3850
3851     if(h0->current_slice == 0){
3852         while(h->frame_num !=  h->prev_frame_num &&
3853               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3854             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3855             if (frame_start(h) < 0)
3856                 return -1;
3857             h->prev_frame_num++;
3858             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3859             s->current_picture_ptr->frame_num= h->prev_frame_num;
3860             execute_ref_pic_marking(h, NULL, 0);
3861         }
3862
3863         /* See if we have a decoded first field looking for a pair... */
3864         if (s0->first_field) {
3865             assert(s0->current_picture_ptr);
3866             assert(s0->current_picture_ptr->data[0]);
3867             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3868
3869             /* figure out if we have a complementary field pair */
3870             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3871                 /*
3872                  * Previous field is unmatched. Don't display it, but let it
3873                  * remain for reference if marked as such.
3874                  */
3875                 s0->current_picture_ptr = NULL;
3876                 s0->first_field = FIELD_PICTURE;
3877
3878             } else {
3879                 if (h->nal_ref_idc &&
3880                         s0->current_picture_ptr->reference &&
3881                         s0->current_picture_ptr->frame_num != h->frame_num) {
3882                     /*
3883                      * This and previous field were reference, but had
3884                      * different frame_nums. Consider this field first in
3885                      * pair. Throw away previous field except for reference
3886                      * purposes.
3887                      */
3888                     s0->first_field = 1;
3889                     s0->current_picture_ptr = NULL;
3890
3891                 } else {
3892                     /* Second field in complementary pair */
3893                     s0->first_field = 0;
3894                 }
3895             }
3896
3897         } else {
3898             /* Frame or first field in a potentially complementary pair */
3899             assert(!s0->current_picture_ptr);
3900             s0->first_field = FIELD_PICTURE;
3901         }
3902
3903         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3904             s0->first_field = 0;
3905             return -1;
3906         }
3907     }
3908     if(h != h0)
3909         clone_slice(h, h0);
3910
3911     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3912
3913     assert(s->mb_num == s->mb_width * s->mb_height);
3914     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3915        first_mb_in_slice                    >= s->mb_num){
3916         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3917         return -1;
3918     }
3919     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3920     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3921     if (s->picture_structure == PICT_BOTTOM_FIELD)
3922         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3923     assert(s->mb_y < s->mb_height);
3924
3925     if(s->picture_structure==PICT_FRAME){
3926         h->curr_pic_num=   h->frame_num;
3927         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3928     }else{
3929         h->curr_pic_num= 2*h->frame_num + 1;
3930         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3931     }
3932
3933     if(h->nal_unit_type == NAL_IDR_SLICE){
3934         get_ue_golomb(&s->gb); /* idr_pic_id */
3935     }
3936
3937     if(h->sps.poc_type==0){
3938         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3939
3940         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3941             h->delta_poc_bottom= get_se_golomb(&s->gb);
3942         }
3943     }
3944
3945     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3946         h->delta_poc[0]= get_se_golomb(&s->gb);
3947
3948         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3949             h->delta_poc[1]= get_se_golomb(&s->gb);
3950     }
3951
3952     init_poc(h);
3953
3954     if(h->pps.redundant_pic_cnt_present){
3955         h->redundant_pic_count= get_ue_golomb(&s->gb);
3956     }
3957
3958     //set defaults, might be overridden a few lines later
3959     h->ref_count[0]= h->pps.ref_count[0];
3960     h->ref_count[1]= h->pps.ref_count[1];
3961
3962     if(h->slice_type_nos != FF_I_TYPE){
3963         if(h->slice_type_nos == FF_B_TYPE){
3964             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3965         }
3966         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3967
3968         if(num_ref_idx_active_override_flag){
3969             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3970             if(h->slice_type_nos==FF_B_TYPE)
3971                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3972
3973             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3974                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3975                 h->ref_count[0]= h->ref_count[1]= 1;
3976                 return -1;
3977             }
3978         }
3979         if(h->slice_type_nos == FF_B_TYPE)
3980             h->list_count= 2;
3981         else
3982             h->list_count= 1;
3983     }else
3984         h->list_count= 0;
3985
3986     if(!default_ref_list_done){
3987         fill_default_ref_list(h);
3988     }
3989
3990     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3991         return -1;
3992
3993     if(h->slice_type_nos!=FF_I_TYPE){
3994         s->last_picture_ptr= &h->ref_list[0][0];
3995         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3996     }
3997     if(h->slice_type_nos==FF_B_TYPE){
3998         s->next_picture_ptr= &h->ref_list[1][0];
3999         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
4000     }
4001
4002     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
4003        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
4004         pred_weight_table(h);
4005     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
4006         implicit_weight_table(h);
4007     else {
4008         h->use_weight = 0;
4009         for (i = 0; i < 2; i++) {
4010             h->luma_weight_flag[i]   = 0;
4011             h->chroma_weight_flag[i] = 0;
4012         }
4013     }
4014
4015     if(h->nal_ref_idc)
4016         decode_ref_pic_marking(h0, &s->gb);
4017
4018     if(FRAME_MBAFF)
4019         fill_mbaff_ref_list(h);
4020
4021     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
4022         direct_dist_scale_factor(h);
4023     direct_ref_list_init(h);
4024
4025     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
4026         tmp = get_ue_golomb_31(&s->gb);
4027         if(tmp > 2){
4028             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4029             return -1;
4030         }
4031         h->cabac_init_idc= tmp;
4032     }
4033
4034     h->last_qscale_diff = 0;
4035     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4036     if(tmp>51){
4037         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4038         return -1;
4039     }
4040     s->qscale= tmp;
4041     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4042     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4043     //FIXME qscale / qp ... stuff
4044     if(h->slice_type == FF_SP_TYPE){
4045         get_bits1(&s->gb); /* sp_for_switch_flag */
4046     }
4047     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4048         get_se_golomb(&s->gb); /* slice_qs_delta */
4049     }
4050
4051     h->deblocking_filter = 1;
4052     h->slice_alpha_c0_offset = 0;
4053     h->slice_beta_offset = 0;
4054     if( h->pps.deblocking_filter_parameters_present ) {
4055         tmp= get_ue_golomb_31(&s->gb);
4056         if(tmp > 2){
4057             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4058             return -1;
4059         }
4060         h->deblocking_filter= tmp;
4061         if(h->deblocking_filter < 2)
4062             h->deblocking_filter^= 1; // 1<->0
4063
4064         if( h->deblocking_filter ) {
4065             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4066             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4067         }
4068     }
4069
4070     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4071        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4072        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4073        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4074         h->deblocking_filter= 0;
4075
4076     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4077         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4078             /* Cheat slightly for speed:
4079                Do not bother to deblock across slices. */
4080             h->deblocking_filter = 2;
4081         } else {
4082             h0->max_contexts = 1;
4083             if(!h0->single_decode_warning) {
4084                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4085                 h0->single_decode_warning = 1;
4086             }
4087             if(h != h0)
4088                 return 1; // deblocking switched inside frame
4089         }
4090     }
4091
4092 #if 0 //FMO
4093     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4094         slice_group_change_cycle= get_bits(&s->gb, ?);
4095 #endif
4096
4097     h0->last_slice_type = slice_type;
4098     h->slice_num = ++h0->current_slice;
4099     if(h->slice_num >= MAX_SLICES){
4100         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4101     }
4102
4103     for(j=0; j<2; j++){
4104         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4105         ref2frm[0]=
4106         ref2frm[1]= -1;
4107         for(i=0; i<16; i++)
4108             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4109                           +(h->ref_list[j][i].reference&3);
4110         ref2frm[18+0]=
4111         ref2frm[18+1]= -1;
4112         for(i=16; i<48; i++)
4113             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4114                           +(h->ref_list[j][i].reference&3);
4115     }
4116
4117     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4118     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4119
4120     s->avctx->refs= h->sps.ref_frame_count;
4121
4122     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4123         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4124                h->slice_num,
4125                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4126                first_mb_in_slice,
4127                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4128                pps_id, h->frame_num,
4129                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4130                h->ref_count[0], h->ref_count[1],
4131                s->qscale,
4132                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4133                h->use_weight,
4134                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4135                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4136                );
4137     }
4138
4139     return 0;
4140 }
4141
4142 /**
4143  *
4144  */
4145 static inline int get_level_prefix(GetBitContext *gb){
4146     unsigned int buf;
4147     int log;
4148
4149     OPEN_READER(re, gb);
4150     UPDATE_CACHE(re, gb);
4151     buf=GET_CACHE(re, gb);
4152
4153     log= 32 - av_log2(buf);
4154 #ifdef TRACE
4155     print_bin(buf>>(32-log), log);
4156     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4157 #endif
4158
4159     LAST_SKIP_BITS(re, gb, log);
4160     CLOSE_READER(re, gb);
4161
4162     return log-1;
4163 }
4164
4165 static inline int get_dct8x8_allowed(H264Context *h){
4166     if(h->sps.direct_8x8_inference_flag)
4167         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4168     else
4169         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4170 }
4171
4172 /**
4173  * decodes a residual block.
4174  * @param n block index
4175  * @param scantable scantable
4176  * @param max_coeff number of coefficients in the block
4177  * @return <0 if an error occurred
4178  */
4179 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4180     MpegEncContext * const s = &h->s;
4181     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4182     int level[16];
4183     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4184
4185     //FIXME put trailing_onex into the context
4186
4187     if(n == CHROMA_DC_BLOCK_INDEX){
4188         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4189         total_coeff= coeff_token>>2;
4190     }else{
4191         if(n == LUMA_DC_BLOCK_INDEX){
4192             total_coeff= pred_non_zero_count(h, 0);
4193             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4194             total_coeff= coeff_token>>2;
4195         }else{
4196             total_coeff= pred_non_zero_count(h, n);
4197             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4198             total_coeff= coeff_token>>2;
4199             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4200         }
4201     }
4202
4203     //FIXME set last_non_zero?
4204
4205     if(total_coeff==0)
4206         return 0;
4207     if(total_coeff > (unsigned)max_coeff) {
4208         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4209         return -1;
4210     }
4211
4212     trailing_ones= coeff_token&3;
4213     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4214     assert(total_coeff<=16);
4215
4216     i = show_bits(gb, 3);
4217     skip_bits(gb, trailing_ones);
4218     level[0] = 1-((i&4)>>1);
4219     level[1] = 1-((i&2)   );
4220     level[2] = 1-((i&1)<<1);
4221
4222     if(trailing_ones<total_coeff) {
4223         int mask, prefix;
4224         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4225         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4226         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4227
4228         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4229         if(level_code >= 100){
4230             prefix= level_code - 100;
4231             if(prefix == LEVEL_TAB_BITS)
4232                 prefix += get_level_prefix(gb);
4233
4234             //first coefficient has suffix_length equal to 0 or 1
4235             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4236                 if(suffix_length)
4237                     level_code= (prefix<<1) + get_bits1(gb); //part
4238                 else
4239                     level_code= prefix; //part
4240             }else if(prefix==14){
4241                 if(suffix_length)
4242                     level_code= (prefix<<1) + get_bits1(gb); //part
4243                 else
4244                     level_code= prefix + get_bits(gb, 4); //part
4245             }else{
4246                 level_code= 30 + get_bits(gb, prefix-3); //part
4247                 if(prefix>=16)
4248                     level_code += (1<<(prefix-3))-4096;
4249             }
4250
4251             if(trailing_ones < 3) level_code += 2;
4252
4253             suffix_length = 2;
4254             mask= -(level_code&1);
4255             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4256         }else{
4257             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4258
4259             suffix_length = 1;
4260             if(level_code + 3U > 6U)
4261                 suffix_length++;
4262             level[trailing_ones]= level_code;
4263         }
4264
4265         //remaining coefficients have suffix_length > 0
4266         for(i=trailing_ones+1;i<total_coeff;i++) {
4267             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4268             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4269             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4270
4271             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4272             if(level_code >= 100){
4273                 prefix= level_code - 100;
4274                 if(prefix == LEVEL_TAB_BITS){
4275                     prefix += get_level_prefix(gb);
4276                 }
4277                 if(prefix<15){
4278                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4279                 }else{
4280                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4281                     if(prefix>=16)
4282                         level_code += (1<<(prefix-3))-4096;
4283                 }
4284                 mask= -(level_code&1);
4285                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4286             }
4287             level[i]= level_code;
4288
4289             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4290                 suffix_length++;
4291         }
4292     }
4293
4294     if(total_coeff == max_coeff)
4295         zeros_left=0;
4296     else{
4297         if(n == CHROMA_DC_BLOCK_INDEX)
4298             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4299         else
4300             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4301     }
4302
4303     coeff_num = zeros_left + total_coeff - 1;
4304     j = scantable[coeff_num];
4305     if(n > 24){
4306         block[j] = level[0];
4307         for(i=1;i<total_coeff;i++) {
4308             if(zeros_left <= 0)
4309                 run_before = 0;
4310             else if(zeros_left < 7){
4311                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4312             }else{
4313                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4314             }
4315             zeros_left -= run_before;
4316             coeff_num -= 1 + run_before;
4317             j= scantable[ coeff_num ];
4318
4319             block[j]= level[i];
4320         }
4321     }else{
4322         block[j] = (level[0] * qmul[j] + 32)>>6;
4323         for(i=1;i<total_coeff;i++) {
4324             if(zeros_left <= 0)
4325                 run_before = 0;
4326             else if(zeros_left < 7){
4327                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4328             }else{
4329                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4330             }
4331             zeros_left -= run_before;
4332             coeff_num -= 1 + run_before;
4333             j= scantable[ coeff_num ];
4334
4335             block[j]= (level[i] * qmul[j] + 32)>>6;
4336         }
4337     }
4338
4339     if(zeros_left<0){
4340         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4341         return -1;
4342     }
4343
4344     return 0;
4345 }
4346
4347 static void predict_field_decoding_flag(H264Context *h){
4348     MpegEncContext * const s = &h->s;
4349     const int mb_xy= h->mb_xy;
4350     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4351                 ? s->current_picture.mb_type[mb_xy-1]
4352                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4353                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4354                 : 0;
4355     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4356 }
4357
4358 /**
4359  * decodes a P_SKIP or B_SKIP macroblock
4360  */
4361 static void decode_mb_skip(H264Context *h){
4362     MpegEncContext * const s = &h->s;
4363     const int mb_xy= h->mb_xy;
4364     int mb_type=0;
4365
4366     memset(h->non_zero_count[mb_xy], 0, 16);
4367     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4368
4369     if(MB_FIELD)
4370         mb_type|= MB_TYPE_INTERLACED;
4371
4372     if( h->slice_type_nos == FF_B_TYPE )
4373     {
4374         // just for fill_caches. pred_direct_motion will set the real mb_type
4375         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4376
4377         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4378         pred_direct_motion(h, &mb_type);
4379         mb_type|= MB_TYPE_SKIP;
4380     }
4381     else
4382     {
4383         int mx, my;
4384         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4385
4386         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4387         pred_pskip_motion(h, &mx, &my);
4388         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4389         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4390     }
4391
4392     write_back_motion(h, mb_type);
4393     s->current_picture.mb_type[mb_xy]= mb_type;
4394     s->current_picture.qscale_table[mb_xy]= s->qscale;
4395     h->slice_table[ mb_xy ]= h->slice_num;
4396     h->prev_mb_skipped= 1;
4397 }
4398
4399 /**
4400  * Decodes one macroblock of a CAVLC coded slice.
      *
      * In non-I slices a pending skip run is handled first. Otherwise the
      * function reads mb_type, then either the intra prediction modes or the
      * reference indices and motion vector differences, then the coded block
      * pattern, the QP delta and the residual blocks. The results are written
      * to the caches in h and to the per-macroblock tables (mb_type,
      * qscale_table, slice_table, non_zero_count).
      *
4401  * @returns 0 if OK, -1 if an error is noticed
4402  */
4403 static int decode_mb_cavlc(H264Context *h){
4404     MpegEncContext * const s = &h->s;
4405     int mb_xy;
4406     int partition_count;
4407     unsigned int mb_type, cbp;
4408     int dct8x8_allowed= h->pps.transform_8x8_mode;
4409
4410     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4411
4412     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4413     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4414                 down the code */
4415     if(h->slice_type_nos != FF_I_TYPE){
             // -1 means no skip run is pending: read the next run length
4416         if(s->mb_skip_run==-1)
4417             s->mb_skip_run= get_ue_golomb(&s->gb);
4418
             // a non-zero remaining run means this macroblock is skipped
4419         if (s->mb_skip_run--) {
                 // on even mb_y of an MBAFF frame the field flag is read only if
                 // this was the last macroblock of the run, otherwise it is predicted
4420             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4421                 if(s->mb_skip_run==0)
4422                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4423                 else
4424                     predict_field_decoding_flag(h);
4425             }
4426             decode_mb_skip(h);
4427             return 0;
4428         }
4429     }
         // in MBAFF frames the field decoding flag is read on even mb_y only
4430     if(FRAME_MBAFF){
4431         if( (s->mb_y&1) == 0 )
4432             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4433     }
4434
4435     h->prev_mb_skipped= 0;
4436
         // mb_type numbering depends on the slice type: B slices have 23 inter
         // types and P slices 5; larger values are intra types stored with that offset
4437     mb_type= get_ue_golomb(&s->gb);
4438     if(h->slice_type_nos == FF_B_TYPE){
4439         if(mb_type < 23){
4440             partition_count= b_mb_type_info[mb_type].partition_count;
4441             mb_type=         b_mb_type_info[mb_type].type;
4442         }else{
4443             mb_type -= 23;
4444             goto decode_intra_mb;
4445         }
4446     }else if(h->slice_type_nos == FF_P_TYPE){
4447         if(mb_type < 5){
4448             partition_count= p_mb_type_info[mb_type].partition_count;
4449             mb_type=         p_mb_type_info[mb_type].type;
4450         }else{
4451             mb_type -= 5;
4452             goto decode_intra_mb;
4453         }
4454     }else{
4455        assert(h->slice_type_nos == FF_I_TYPE);
4456         if(h->slice_type == FF_SI_TYPE && mb_type)
4457             mb_type--;
4458 decode_intra_mb:
             // i_mb_type_info is indexed with values 0..25 only
4459         if(mb_type > 25){
4460             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4461             return -1;
4462         }
4463         partition_count=0;
4464         cbp= i_mb_type_info[mb_type].cbp;
4465         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4466         mb_type= i_mb_type_info[mb_type].type;
4467     }
4468
4469     if(MB_FIELD)
4470         mb_type |= MB_TYPE_INTERLACED;
4471
4472     h->slice_table[ mb_xy ]= h->slice_num;
4473
4474     if(IS_INTRA_PCM(mb_type)){
4475         unsigned int x;
4476
4477         // We assume these blocks are very rare so we do not optimize it.
4478         align_get_bits(&s->gb);
4479
4480         // The pixels are stored in the same order as levels in h->mb array.
             // 256 bytes are read, or 384 when CHROMA is non-zero
4481         for(x=0; x < (CHROMA ? 384 : 256); x++){
4482             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4483         }
4484
4485         // In deblocking, the quantizer is 0
4486         s->current_picture.qscale_table[mb_xy]= 0;
4487         // All coeffs are present
4488         memset(h->non_zero_count[mb_xy], 16, 16);
4489
4490         s->current_picture.mb_type[mb_xy]= mb_type;
4491         return 0;
4492     }
4493
         // ref_count is doubled while an MB_MBAFF macroblock is decoded and
         // halved again just before the final return 0.
         // NOTE(review): the return -1 paths below leave ref_count doubled --
         // confirm that callers reset ref_count after an error.
4494     if(MB_MBAFF){
4495         h->ref_count[0] <<= 1;
4496         h->ref_count[1] <<= 1;
4497     }
4498
4499     fill_caches(h, mb_type, 0);
4500
4501     //mb_pred
4502     if(IS_INTRA(mb_type)){
4503         int pred_mode;
4504 //            init_top_left_availability(h);
4505         if(IS_INTRA4x4(mb_type)){
4506             int i;
4507             int di = 1;
                 // a set bit selects MB_TYPE_8x8DCT; the loop below then steps by 4,
                 // i.e. one prediction mode per 8x8 block instead of per 4x4 block
4508             if(dct8x8_allowed && get_bits1(&s->gb)){
4509                 mb_type |= MB_TYPE_8x8DCT;
4510                 di = 4;
4511             }
4512
4513 //                fill_intra4x4_pred_table(h);
4514             for(i=0; i<16; i+=di){
4515                 int mode= pred_intra_mode(h, i);
4516
                     // bit set: keep the predicted mode; otherwise a 3 bit value
                     // follows which skips over the predicted mode
4517                 if(!get_bits1(&s->gb)){
4518                     const int rem_mode= get_bits(&s->gb, 3);
4519                     mode = rem_mode + (rem_mode >= mode);
4520                 }
4521
4522                 if(di==4)
4523                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4524                 else
4525                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4526             }
4527             write_back_intra_pred_mode(h);
4528             if( check_intra4x4_pred_mode(h) < 0)
4529                 return -1;
4530         }else{
4531             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4532             if(h->intra16x16_pred_mode < 0)
4533                 return -1;
4534         }
4535         if(CHROMA){
4536             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4537             if(pred_mode < 0)
4538                 return -1;
4539             h->chroma_pred_mode= pred_mode;
4540         }
4541     }else if(partition_count==4){
4542         int i, j, sub_partition_count[4], list, ref[2][4];
4543
             // read the four sub_mb_types and translate them through the
             // slice type specific table
4544         if(h->slice_type_nos == FF_B_TYPE){
4545             for(i=0; i<4; i++){
4546                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4547                 if(h->sub_mb_type[i] >=13){
4548                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4549                     return -1;
4550                 }
4551                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4552                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4553             }
4554             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4555                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4556                 pred_direct_motion(h, &mb_type);
4557                 h->ref_cache[0][scan8[4]] =
4558                 h->ref_cache[1][scan8[4]] =
4559                 h->ref_cache[0][scan8[12]] =
4560                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4561             }
4562         }else{
4563             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4564             for(i=0; i<4; i++){
4565                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4566                 if(h->sub_mb_type[i] >=4){
4567                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4568                     return -1;
4569                 }
4570                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4571                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4572             }
4573         }
4574
             // first pass: one reference index per 8x8 block and list. Nothing is
             // read for a count of 1, a single inverted bit for a count of 2,
             // otherwise a range checked ue golomb code
4575         for(list=0; list<h->list_count; list++){
4576             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4577             for(i=0; i<4; i++){
4578                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4579                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4580                     unsigned int tmp;
4581                     if(ref_count == 1){
4582                         tmp= 0;
4583                     }else if(ref_count == 2){
4584                         tmp= get_bits1(&s->gb)^1;
4585                     }else{
4586                         tmp= get_ue_golomb_31(&s->gb);
4587                         if(tmp>=ref_count){
4588                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4589                             return -1;
4590                         }
4591                     }
4592                     ref[list][i]= tmp;
4593                 }else{
4594                  //FIXME
4595                     ref[list][i] = -1;
4596                 }
4597             }
4598         }
4599
4600         if(dct8x8_allowed)
4601             dct8x8_allowed = get_dct8x8_allowed(h);
4602
             // second pass: store the references in ref_cache and read one motion
             // vector difference per sub partition
4603         for(list=0; list<h->list_count; list++){
4604             for(i=0; i<4; i++){
4605                 if(IS_DIRECT(h->sub_mb_type[i])) {
4606                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4607                     continue;
4608                 }
4609                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4610                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4611
4612                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4613                     const int sub_mb_type= h->sub_mb_type[i];
4614                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4615                     for(j=0; j<sub_partition_count[i]; j++){
4616                         int mx, my;
4617                         const int index= 4*i + block_width*j;
4618                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4619                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4620                         mx += get_se_golomb(&s->gb);
4621                         my += get_se_golomb(&s->gb);
4622                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4623
                             // replicate the vector over the cache entries covered
                             // by this sub partition
4624                         if(IS_SUB_8X8(sub_mb_type)){
4625                             mv_cache[ 1 ][0]=
4626                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4627                             mv_cache[ 1 ][1]=
4628                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4629                         }else if(IS_SUB_8X4(sub_mb_type)){
4630                             mv_cache[ 1 ][0]= mx;
4631                             mv_cache[ 1 ][1]= my;
4632                         }else if(IS_SUB_4X8(sub_mb_type)){
4633                             mv_cache[ 8 ][0]= mx;
4634                             mv_cache[ 8 ][1]= my;
4635                         }
4636                         mv_cache[ 0 ][0]= mx;
4637                         mv_cache[ 0 ][1]= my;
4638                     }
4639                 }else{
                         // list not used by this 8x8 block: zero its four vectors
4640                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4641                     p[0] = p[1]=
4642                     p[8] = p[9]= 0;
4643                 }
4644             }
4645         }
4646     }else if(IS_DIRECT(mb_type)){
4647         pred_direct_motion(h, &mb_type);
4648         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4649     }else{
             // 16x16, 16x8 and 8x16 partitions: for each shape all reference
             // indices are read first, then all motion vector differences
4650         int list, mx, my, i;
4651          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4652         if(IS_16X16(mb_type)){
4653             for(list=0; list<h->list_count; list++){
4654                     unsigned int val;
4655                     if(IS_DIR(mb_type, 0, list)){
4656                         if(h->ref_count[list]==1){
4657                             val= 0;
4658                         }else if(h->ref_count[list]==2){
4659                             val= get_bits1(&s->gb)^1;
4660                         }else{
4661                             val= get_ue_golomb_31(&s->gb);
4662                             if(val >= h->ref_count[list]){
4663                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4664                                 return -1;
4665                             }
4666                         }
4667                     }else
4668                         val= LIST_NOT_USED&0xFF;
4669                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4670             }
4671             for(list=0; list<h->list_count; list++){
4672                 unsigned int val;
4673                 if(IS_DIR(mb_type, 0, list)){
4674                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4675                     mx += get_se_golomb(&s->gb);
4676                     my += get_se_golomb(&s->gb);
4677                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4678
4679                     val= pack16to32(mx,my);
4680                 }else
4681                     val=0;
4682                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4683             }
4684         }
4685         else if(IS_16X8(mb_type)){
4686             for(list=0; list<h->list_count; list++){
4687                     for(i=0; i<2; i++){
4688                         unsigned int val;
4689                         if(IS_DIR(mb_type, i, list)){
4690                             if(h->ref_count[list] == 1){
4691                                 val= 0;
4692                             }else if(h->ref_count[list] == 2){
4693                                 val= get_bits1(&s->gb)^1;
4694                             }else{
4695                                 val= get_ue_golomb_31(&s->gb);
4696                                 if(val >= h->ref_count[list]){
4697                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4698                                     return -1;
4699                                 }
4700                             }
4701                         }else
4702                             val= LIST_NOT_USED&0xFF;
4703                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4704                     }
4705             }
4706             for(list=0; list<h->list_count; list++){
4707                 for(i=0; i<2; i++){
4708                     unsigned int val;
4709                     if(IS_DIR(mb_type, i, list)){
4710                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4711                         mx += get_se_golomb(&s->gb);
4712                         my += get_se_golomb(&s->gb);
4713                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4714
4715                         val= pack16to32(mx,my);
4716                     }else
4717                         val=0;
4718                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4719                 }
4720             }
4721         }else{
4722             assert(IS_8X16(mb_type));
4723             for(list=0; list<h->list_count; list++){
4724                     for(i=0; i<2; i++){
4725                         unsigned int val;
4726                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4727                             if(h->ref_count[list]==1){
4728                                 val= 0;
4729                             }else if(h->ref_count[list]==2){
4730                                 val= get_bits1(&s->gb)^1;
4731                             }else{
4732                                 val= get_ue_golomb_31(&s->gb);
4733                                 if(val >= h->ref_count[list]){
4734                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4735                                     return -1;
4736                                 }
4737                             }
4738                         }else
4739                             val= LIST_NOT_USED&0xFF;
4740                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4741                     }
4742             }
4743             for(list=0; list<h->list_count; list++){
4744                 for(i=0; i<2; i++){
4745                     unsigned int val;
4746                     if(IS_DIR(mb_type, i, list)){
4747                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4748                         mx += get_se_golomb(&s->gb);
4749                         my += get_se_golomb(&s->gb);
4750                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4751
4752                         val= pack16to32(mx,my);
4753                     }else
4754                         val=0;
4755                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4756                 }
4757             }
4758         }
4759     }
4760
4761     if(IS_INTER(mb_type))
4762         write_back_motion(h, mb_type);
4763
         // for Intra16x16 cbp was already taken from i_mb_type_info above; all
         // other types code it explicitly and map it through a lookup table
4764     if(!IS_INTRA16x16(mb_type)){
4765         cbp= get_ue_golomb(&s->gb);
4766         if(cbp > 47){
4767             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4768             return -1;
4769         }
4770
4771         if(CHROMA){
4772             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4773             else                     cbp= golomb_to_inter_cbp   [cbp];
4774         }else{
4775             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4776             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4777         }
4778     }
4779     h->cbp = cbp;
4780
         // non-intra macroblocks with any of the low 4 cbp bits set carry one
         // more bit selecting MB_TYPE_8x8DCT
4781     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4782         if(get_bits1(&s->gb)){
4783             mb_type |= MB_TYPE_8x8DCT;
4784             h->cbp_table[mb_xy]= cbp;
4785         }
4786     }
4787     s->current_picture.mb_type[mb_xy]= mb_type;
4788
4789     if(cbp || IS_INTRA16x16(mb_type)){
4790         int i8x8, i4x4, chroma_idx;
4791         int dquant;
4792         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4793         const uint8_t *scan, *scan8x8, *dc_scan;
4794
4795 //        fill_non_zero_count_cache(h);
4796
             // the scan tables depend on field/frame coding; the _q0 variants are
             // selected while s->qscale is 0 (evaluated before dquant is applied)
4797         if(IS_INTERLACED(mb_type)){
4798             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4799             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4800             dc_scan= luma_dc_field_scan;
4801         }else{
4802             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4803             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4804             dc_scan= luma_dc_zigzag_scan;
4805         }
4806
4807         dquant= get_se_golomb(&s->gb);
4808
4809         if( dquant > 25 || dquant < -26 ){
4810             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4811             return -1;
4812         }
4813
             // apply the delta and wrap the result back into 0..51
4814         s->qscale += dquant;
4815         if(((unsigned)s->qscale) > 51){
4816             if(s->qscale<0) s->qscale+= 52;
4817             else            s->qscale-= 52;
4818         }
4819
4820         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4821         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4822         if(IS_INTRA16x16(mb_type)){
                 // one 16 coefficient block with LUMA_DC_BLOCK_INDEX first, then
                 // sixteen 15 coefficient blocks if the low cbp bits are set
4823             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4824                 return -1; //FIXME continue if partitioned and other return -1 too
4825             }
4826
4827             assert((cbp&15) == 0 || (cbp&15) == 15);
4828
4829             if(cbp&15){
4830                 for(i8x8=0; i8x8<4; i8x8++){
4831                     for(i4x4=0; i4x4<4; i4x4++){
4832                         const int index= i4x4 + 4*i8x8;
4833                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4834                             return -1;
4835                         }
4836                     }
4837                 }
4838             }else{
4839                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4840             }
4841         }else{
                 // cbp bit i8x8 tells whether the i8x8-th 8x8 block has coefficients
4842             for(i8x8=0; i8x8<4; i8x8++){
4843                 if(cbp & (1<<i8x8)){
4844                     if(IS_8x8DCT(mb_type)){
                             // four 16 coefficient passes write into the same 64
                             // coefficient buffer, each with its own part of scan8x8
4845                         DCTELEM *buf = &h->mb[64*i8x8];
4846                         uint8_t *nnz;
4847                         for(i4x4=0; i4x4<4; i4x4++){
4848                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4849                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4850                                 return -1;
4851                         }
                             // the first cache entry of the 8x8 block receives the sum
                             // of its four counts
4852                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4853                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4854                     }else{
4855                         for(i4x4=0; i4x4<4; i4x4++){
4856                             const int index= i4x4 + 4*i8x8;
4857
4858                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4859                                 return -1;
4860                             }
4861                         }
4862                     }
4863                 }else{
4864                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4865                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4866                 }
4867             }
4868         }
4869
             // cbp bit 4 or 5: a 4 coefficient CHROMA_DC_BLOCK_INDEX block is coded
             // for each of the two chroma planes
4870         if(cbp&0x30){
4871             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4872                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4873                     return -1;
4874                 }
4875         }
4876
             // cbp bit 5: the 15 coefficient chroma blocks (indices 16..23) are
             // coded as well, otherwise their non-zero counts are cleared
4877         if(cbp&0x20){
4878             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4879                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4880                 for(i4x4=0; i4x4<4; i4x4++){
4881                     const int index= 16 + 4*chroma_idx + i4x4;
4882                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4883                         return -1;
4884                     }
4885                 }
4886             }
4887         }else{
4888             uint8_t * const nnz= &h->non_zero_count_cache[0];
4889             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4890             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4891         }
4892     }else{
             // no residual at all: clear every non-zero count in the cache
4893         uint8_t * const nnz= &h->non_zero_count_cache[0];
4894         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4895         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4896         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4897     }
4898     s->current_picture.qscale_table[mb_xy]= s->qscale;
4899     write_back_non_zero_count(h);
4900
         // undo the doubling of ref_count done before fill_caches()
4901     if(MB_MBAFF){
4902         h->ref_count[0] >>= 1;
4903         h->ref_count[1] >>= 1;
4904     }
4905
4906     return 0;
4907 }
4908
4909 static int decode_cabac_field_decoding_flag(H264Context *h) {
4910     MpegEncContext * const s = &h->s;
4911     const int mb_x = s->mb_x;
4912     const int mb_y = s->mb_y & ~1;
4913     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4914     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4915
4916     unsigned int ctx = 0;
4917
4918     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4919         ctx += 1;
4920     }
4921     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4922         ctx += 1;
4923     }
4924
4925     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4926 }
4927
4928 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4929     uint8_t *state= &h->cabac_state[ctx_base];
4930     int mb_type;
4931
4932     if(intra_slice){
4933         MpegEncContext * const s = &h->s;
4934         const int mba_xy = h->left_mb_xy[0];
4935         const int mbb_xy = h->top_mb_xy;
4936         int ctx=0;
4937         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4938             ctx++;
4939         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4940             ctx++;
4941         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4942             return 0;   /* I4x4 */
4943         state += 2;
4944     }else{
4945         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4946             return 0;   /* I4x4 */
4947     }
4948
4949     if( get_cabac_terminate( &h->cabac ) )
4950         return 25;  /* PCM */
4951
4952     mb_type = 1; /* I16x16 */
4953     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4954     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4955         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4956     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4957     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4958     return mb_type;
4959 }
4960
4961 static int decode_cabac_mb_type_b( H264Context *h ) {
4962     MpegEncContext * const s = &h->s;
4963
4964         const int mba_xy = h->left_mb_xy[0];
4965         const int mbb_xy = h->top_mb_xy;
4966         int ctx = 0;
4967         int bits;
4968         assert(h->slice_type_nos == FF_B_TYPE);
4969
4970         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4971             ctx++;
4972         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4973             ctx++;
4974
4975         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4976             return 0; /* B_Direct_16x16 */
4977
4978         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4979             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4980         }
4981
4982         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4983         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4984         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4985         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4986         if( bits < 8 )
4987             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4988         else if( bits == 13 ) {
4989             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4990         } else if( bits == 14 )
4991             return 11; /* B_L1_L0_8x16 */
4992         else if( bits == 15 )
4993             return 22; /* B_8x8 */
4994
4995         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4996         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4997 }
4998
4999 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5000     MpegEncContext * const s = &h->s;
5001     int mba_xy, mbb_xy;
5002     int ctx = 0;
5003
5004     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5005         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5006         mba_xy = mb_xy - 1;
5007         if( (mb_y&1)
5008             && h->slice_table[mba_xy] == h->slice_num
5009             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5010             mba_xy += s->mb_stride;
5011         if( MB_FIELD ){
5012             mbb_xy = mb_xy - s->mb_stride;
5013             if( !(mb_y&1)
5014                 && h->slice_table[mbb_xy] == h->slice_num
5015                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5016                 mbb_xy -= s->mb_stride;
5017         }else
5018             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5019     }else{
5020         int mb_xy = h->mb_xy;
5021         mba_xy = mb_xy - 1;
5022         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5023     }
5024
5025     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5026         ctx++;
5027     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5028         ctx++;
5029
5030     if( h->slice_type_nos == FF_B_TYPE )
5031         ctx += 13;
5032     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5033 }
5034
5035 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5036     int mode = 0;
5037
5038     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5039         return pred_mode;
5040
5041     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5042     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5043     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5044
5045     if( mode >= pred_mode )
5046         return mode + 1;
5047     else
5048         return mode;
5049 }
5050
5051 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5052     const int mba_xy = h->left_mb_xy[0];
5053     const int mbb_xy = h->top_mb_xy;
5054
5055     int ctx = 0;
5056
5057     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5058     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5059         ctx++;
5060
5061     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5062         ctx++;
5063
5064     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5065         return 0;
5066
5067     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5068         return 1;
5069     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5070         return 2;
5071     else
5072         return 3;
5073 }
5074
5075 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5076     int cbp_b, cbp_a, ctx, cbp = 0;
5077
5078     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5079     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5080
5081     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5082     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5083     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5084     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5085     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5086     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5087     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5088     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5089     return cbp;
5090 }
5091 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5092     int ctx;
5093     int cbp_a, cbp_b;
5094
5095     cbp_a = (h->left_cbp>>4)&0x03;
5096     cbp_b = (h-> top_cbp>>4)&0x03;
5097
5098     ctx = 0;
5099     if( cbp_a > 0 ) ctx++;
5100     if( cbp_b > 0 ) ctx += 2;
5101     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5102         return 0;
5103
5104     ctx = 4;
5105     if( cbp_a == 2 ) ctx++;
5106     if( cbp_b == 2 ) ctx += 2;
5107     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5108 }
5109 static int decode_cabac_mb_dqp( H264Context *h) {
5110     int   ctx= h->last_qscale_diff != 0;
5111     int   val = 0;
5112
5113     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5114         ctx= 2+(ctx>>1);
5115         val++;
5116         if(val > 102) //prevent infinite loop
5117             return INT_MIN;
5118     }
5119
5120     if( val&0x01 )
5121         return   (val + 1)>>1 ;
5122     else
5123         return -((val + 1)>>1);
5124 }
5125 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5126     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5127         return 0;   /* 8x8 */
5128     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5129         return 1;   /* 8x4 */
5130     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5131         return 2;   /* 4x8 */
5132     return 3;       /* 4x4 */
5133 }
5134 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5135     int type;
5136     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5137         return 0;   /* B_Direct_8x8 */
5138     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5139         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5140     type = 3;
5141     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5142         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5143             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5144         type += 4;
5145     }
5146     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5147     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5148     return type;
5149 }
5150
5151 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5152     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5153 }
5154
5155 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5156     int refa = h->ref_cache[list][scan8[n] - 1];
5157     int refb = h->ref_cache[list][scan8[n] - 8];
5158     int ref  = 0;
5159     int ctx  = 0;
5160
5161     if( h->slice_type_nos == FF_B_TYPE) {
5162         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5163             ctx++;
5164         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5165             ctx += 2;
5166     } else {
5167         if( refa > 0 )
5168             ctx++;
5169         if( refb > 0 )
5170             ctx += 2;
5171     }
5172
5173     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5174         ref++;
5175         ctx = (ctx>>2)+4;
5176         if(ref >= 32 /*h->ref_list[list]*/){
5177             return -1;
5178         }
5179     }
5180     return ref;
5181 }
5182
5183 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5184     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5185                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5186     int ctxbase = (l == 0) ? 40 : 47;
5187     int mvd;
5188     int ctx = (amvd>2) + (amvd>32);
5189
5190     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5191         return 0;
5192
5193     mvd= 1;
5194     ctx= 3;
5195     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5196         mvd++;
5197         if( ctx < 6 )
5198             ctx++;
5199     }
5200
5201     if( mvd >= 9 ) {
5202         int k = 3;
5203         while( get_cabac_bypass( &h->cabac ) ) {
5204             mvd += 1 << k;
5205             k++;
5206             if(k>24){
5207                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5208                 return INT_MIN;
5209             }
5210         }
5211         while( k-- ) {
5212             if( get_cabac_bypass( &h->cabac ) )
5213                 mvd += 1 << k;
5214         }
5215     }
5216     return get_cabac_bypass_sign( &h->cabac, -mvd );
5217 }
5218
5219 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5220     int nza, nzb;
5221     int ctx = 0;
5222
5223     if( is_dc ) {
5224         if( cat == 0 ) {
5225             nza = h->left_cbp&0x100;
5226             nzb = h-> top_cbp&0x100;
5227         } else {
5228             nza = (h->left_cbp>>(6+idx))&0x01;
5229             nzb = (h-> top_cbp>>(6+idx))&0x01;
5230         }
5231     } else {
5232         assert(cat == 1 || cat == 2 || cat == 4);
5233         nza = h->non_zero_count_cache[scan8[idx] - 1];
5234         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5235     }
5236
5237     if( nza > 0 )
5238         ctx++;
5239
5240     if( nzb > 0 )
5241         ctx += 2;
5242
5243     return ctx + 4 * cat;
5244 }
5245
5246 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5247     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5248     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5249     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5250     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5251 };
5252
5253 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5254     static const int significant_coeff_flag_offset[2][6] = {
5255       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5256       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5257     };
5258     static const int last_coeff_flag_offset[2][6] = {
5259       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5260       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5261     };
5262     static const int coeff_abs_level_m1_offset[6] = {
5263         227+0, 227+10, 227+20, 227+30, 227+39, 426
5264     };
5265     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5266       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5267         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5268         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5269        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5270       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5271         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5272         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5273         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5274     };
5275     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5276      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5277      * map node ctx => cabac ctx for level=1 */
5278     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5279     /* map node ctx => cabac ctx for level>1 */
5280     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5281     static const uint8_t coeff_abs_level_transition[2][8] = {
5282     /* update node ctx after decoding a level=1 */
5283         { 1, 2, 3, 3, 4, 5, 6, 7 },
5284     /* update node ctx after decoding a level>1 */
5285         { 4, 4, 4, 4, 5, 6, 7, 7 }
5286     };
5287
5288     int index[64];
5289
5290     int av_unused last;
5291     int coeff_count = 0;
5292     int node_ctx = 0;
5293
5294     uint8_t *significant_coeff_ctx_base;
5295     uint8_t *last_coeff_ctx_base;
5296     uint8_t *abs_level_m1_ctx_base;
5297
5298 #if !ARCH_X86
5299 #define CABAC_ON_STACK
5300 #endif
5301 #ifdef CABAC_ON_STACK
5302 #define CC &cc
5303     CABACContext cc;
5304     cc.range     = h->cabac.range;
5305     cc.low       = h->cabac.low;
5306     cc.bytestream= h->cabac.bytestream;
5307 #else
5308 #define CC &h->cabac
5309 #endif
5310
5311
5312     /* cat: 0-> DC 16x16  n = 0
5313      *      1-> AC 16x16  n = luma4x4idx
5314      *      2-> Luma4x4   n = luma4x4idx
5315      *      3-> DC Chroma n = iCbCr
5316      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5317      *      5-> Luma8x8   n = 4 * luma8x8idx
5318      */
5319
5320     /* read coded block flag */
5321     if( is_dc || cat != 5 ) {
5322         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5323             if( !is_dc )
5324                 h->non_zero_count_cache[scan8[n]] = 0;
5325
5326 #ifdef CABAC_ON_STACK
5327             h->cabac.range     = cc.range     ;
5328             h->cabac.low       = cc.low       ;
5329             h->cabac.bytestream= cc.bytestream;
5330 #endif
5331             return;
5332         }
5333     }
5334
5335     significant_coeff_ctx_base = h->cabac_state
5336         + significant_coeff_flag_offset[MB_FIELD][cat];
5337     last_coeff_ctx_base = h->cabac_state
5338         + last_coeff_flag_offset[MB_FIELD][cat];
5339     abs_level_m1_ctx_base = h->cabac_state
5340         + coeff_abs_level_m1_offset[cat];
5341
5342     if( !is_dc && cat == 5 ) {
5343 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5344         for(last= 0; last < coefs; last++) { \
5345             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5346             if( get_cabac( CC, sig_ctx )) { \
5347                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5348                 index[coeff_count++] = last; \
5349                 if( get_cabac( CC, last_ctx ) ) { \
5350                     last= max_coeff; \
5351                     break; \
5352                 } \
5353             } \
5354         }\
5355         if( last == max_coeff -1 ) {\
5356             index[coeff_count++] = last;\
5357         }
5358         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5359 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5360         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5361     } else {
5362         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5363 #else
5364         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5365     } else {
5366         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5367 #endif
5368     }
5369     assert(coeff_count > 0);
5370
5371     if( is_dc ) {
5372         if( cat == 0 )
5373             h->cbp_table[h->mb_xy] |= 0x100;
5374         else
5375             h->cbp_table[h->mb_xy] |= 0x40 << n;
5376     } else {
5377         if( cat == 5 )
5378             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5379         else {
5380             assert( cat == 1 || cat == 2 || cat == 4 );
5381             h->non_zero_count_cache[scan8[n]] = coeff_count;
5382         }
5383     }
5384
5385     do {
5386         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5387
5388         int j= scantable[index[--coeff_count]];
5389
5390         if( get_cabac( CC, ctx ) == 0 ) {
5391             node_ctx = coeff_abs_level_transition[0][node_ctx];
5392             if( is_dc ) {
5393                 block[j] = get_cabac_bypass_sign( CC, -1);
5394             }else{
5395                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5396             }
5397         } else {
5398             int coeff_abs = 2;
5399             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5400             node_ctx = coeff_abs_level_transition[1][node_ctx];
5401
5402             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5403                 coeff_abs++;
5404             }
5405
5406             if( coeff_abs >= 15 ) {
5407                 int j = 0;
5408                 while( get_cabac_bypass( CC ) ) {
5409                     j++;
5410                 }
5411
5412                 coeff_abs=1;
5413                 while( j-- ) {
5414                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5415                 }
5416                 coeff_abs+= 14;
5417             }
5418
5419             if( is_dc ) {
5420                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5421             }else{
5422                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5423             }
5424         }
5425     } while( coeff_count );
5426 #ifdef CABAC_ON_STACK
5427             h->cabac.range     = cc.range     ;
5428             h->cabac.low       = cc.low       ;
5429             h->cabac.bytestream= cc.bytestream;
5430 #endif
5431
5432 }
5433
5434 #if !CONFIG_SMALL
5435 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5436     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5437 }
5438
5439 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5440     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5441 }
5442 #endif
5443
5444 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5445 #if CONFIG_SMALL
5446     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5447 #else
5448     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5449     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5450 #endif
5451 }
5452
5453 static inline void compute_mb_neighbors(H264Context *h)
5454 {
5455     MpegEncContext * const s = &h->s;
5456     const int mb_xy  = h->mb_xy;
5457     h->top_mb_xy     = mb_xy - s->mb_stride;
5458     h->left_mb_xy[0] = mb_xy - 1;
5459     if(FRAME_MBAFF){
5460         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5461         const int top_pair_xy      = pair_xy     - s->mb_stride;
5462         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5463         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5464         const int curr_mb_field_flag = MB_FIELD;
5465         const int bottom = (s->mb_y & 1);
5466
5467         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5468             h->top_mb_xy -= s->mb_stride;
5469         }
5470         if (!left_mb_field_flag == curr_mb_field_flag) {
5471             h->left_mb_xy[0] = pair_xy - 1;
5472         }
5473     } else if (FIELD_PICTURE) {
5474         h->top_mb_xy -= s->mb_stride;
5475     }
5476     return;
5477 }
5478
5479 /**
5480  * decodes a macroblock
5481  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5482  */
5483 static int decode_mb_cabac(H264Context *h) {
5484     MpegEncContext * const s = &h->s;
5485     int mb_xy;
5486     int mb_type, partition_count, cbp = 0;
5487     int dct8x8_allowed= h->pps.transform_8x8_mode;
5488
5489     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5490
5491     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5492     if( h->slice_type_nos != FF_I_TYPE ) {
5493         int skip;
5494         /* a skipped mb needs the aff flag from the following mb */
5495         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5496             predict_field_decoding_flag(h);
5497         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5498             skip = h->next_mb_skipped;
5499         else
5500             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5501         /* read skip flags */
5502         if( skip ) {
5503             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5504                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5505                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5506                 if(!h->next_mb_skipped)
5507                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5508             }
5509
5510             decode_mb_skip(h);
5511
5512             h->cbp_table[mb_xy] = 0;
5513             h->chroma_pred_mode_table[mb_xy] = 0;
5514             h->last_qscale_diff = 0;
5515
5516             return 0;
5517
5518         }
5519     }
5520     if(FRAME_MBAFF){
5521         if( (s->mb_y&1) == 0 )
5522             h->mb_mbaff =
5523             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5524     }
5525
5526     h->prev_mb_skipped = 0;
5527
5528     compute_mb_neighbors(h);
5529
5530     if( h->slice_type_nos == FF_B_TYPE ) {
5531         mb_type = decode_cabac_mb_type_b( h );
5532         if( mb_type < 23 ){
5533             partition_count= b_mb_type_info[mb_type].partition_count;
5534             mb_type=         b_mb_type_info[mb_type].type;
5535         }else{
5536             mb_type -= 23;
5537             goto decode_intra_mb;
5538         }
5539     } else if( h->slice_type_nos == FF_P_TYPE ) {
5540         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5541             /* P-type */
5542             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5543                 /* P_L0_D16x16, P_8x8 */
5544                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5545             } else {
5546                 /* P_L0_D8x16, P_L0_D16x8 */
5547                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5548             }
5549             partition_count= p_mb_type_info[mb_type].partition_count;
5550             mb_type=         p_mb_type_info[mb_type].type;
5551         } else {
5552             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5553             goto decode_intra_mb;
5554         }
5555     } else {
5556         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5557         if(h->slice_type == FF_SI_TYPE && mb_type)
5558             mb_type--;
5559         assert(h->slice_type_nos == FF_I_TYPE);
5560 decode_intra_mb:
5561         partition_count = 0;
5562         cbp= i_mb_type_info[mb_type].cbp;
5563         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5564         mb_type= i_mb_type_info[mb_type].type;
5565     }
5566     if(MB_FIELD)
5567         mb_type |= MB_TYPE_INTERLACED;
5568
5569     h->slice_table[ mb_xy ]= h->slice_num;
5570
5571     if(IS_INTRA_PCM(mb_type)) {
5572         const uint8_t *ptr;
5573
5574         // We assume these blocks are very rare so we do not optimize it.
5575         // FIXME The two following lines get the bitstream position in the cabac
5576         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5577         ptr= h->cabac.bytestream;
5578         if(h->cabac.low&0x1) ptr--;
5579         if(CABAC_BITS==16){
5580             if(h->cabac.low&0x1FF) ptr--;
5581         }
5582
5583         // The pixels are stored in the same order as levels in h->mb array.
5584         memcpy(h->mb, ptr, 256); ptr+=256;
5585         if(CHROMA){
5586             memcpy(h->mb+128, ptr, 128); ptr+=128;
5587         }
5588
5589         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5590
5591         // All blocks are present
5592         h->cbp_table[mb_xy] = 0x1ef;
5593         h->chroma_pred_mode_table[mb_xy] = 0;
5594         // In deblocking, the quantizer is 0
5595         s->current_picture.qscale_table[mb_xy]= 0;
5596         // All coeffs are present
5597         memset(h->non_zero_count[mb_xy], 16, 16);
5598         s->current_picture.mb_type[mb_xy]= mb_type;
5599         h->last_qscale_diff = 0;
5600         return 0;
5601     }
5602
5603     if(MB_MBAFF){
5604         h->ref_count[0] <<= 1;
5605         h->ref_count[1] <<= 1;
5606     }
5607
5608     fill_caches(h, mb_type, 0);
5609
5610     if( IS_INTRA( mb_type ) ) {
5611         int i, pred_mode;
5612         if( IS_INTRA4x4( mb_type ) ) {
5613             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5614                 mb_type |= MB_TYPE_8x8DCT;
5615                 for( i = 0; i < 16; i+=4 ) {
5616                     int pred = pred_intra_mode( h, i );
5617                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5618                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5619                 }
5620             } else {
5621                 for( i = 0; i < 16; i++ ) {
5622                     int pred = pred_intra_mode( h, i );
5623                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5624
5625                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5626                 }
5627             }
5628             write_back_intra_pred_mode(h);
5629             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5630         } else {
5631             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5632             if( h->intra16x16_pred_mode < 0 ) return -1;
5633         }
5634         if(CHROMA){
5635             h->chroma_pred_mode_table[mb_xy] =
5636             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5637
5638             pred_mode= check_intra_pred_mode( h, pred_mode );
5639             if( pred_mode < 0 ) return -1;
5640             h->chroma_pred_mode= pred_mode;
5641         }
5642     } else if( partition_count == 4 ) {
5643         int i, j, sub_partition_count[4], list, ref[2][4];
5644
5645         if( h->slice_type_nos == FF_B_TYPE ) {
5646             for( i = 0; i < 4; i++ ) {
5647                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5648                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5649                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5650             }
5651             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5652                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5653                 pred_direct_motion(h, &mb_type);
5654                 h->ref_cache[0][scan8[4]] =
5655                 h->ref_cache[1][scan8[4]] =
5656                 h->ref_cache[0][scan8[12]] =
5657                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5658                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5659                     for( i = 0; i < 4; i++ )
5660                         if( IS_DIRECT(h->sub_mb_type[i]) )
5661                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5662                 }
5663             }
5664         } else {
5665             for( i = 0; i < 4; i++ ) {
5666                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5667                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5668                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5669             }
5670         }
5671
5672         for( list = 0; list < h->list_count; list++ ) {
5673                 for( i = 0; i < 4; i++ ) {
5674                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5675                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5676                         if( h->ref_count[list] > 1 ){
5677                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5678                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5679                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5680                                 return -1;
5681                             }
5682                         }else
5683                             ref[list][i] = 0;
5684                     } else {
5685                         ref[list][i] = -1;
5686                     }
5687                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5688                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5689                 }
5690         }
5691
5692         if(dct8x8_allowed)
5693             dct8x8_allowed = get_dct8x8_allowed(h);
5694
5695         for(list=0; list<h->list_count; list++){
5696             for(i=0; i<4; i++){
5697                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5698                 if(IS_DIRECT(h->sub_mb_type[i])){
5699                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5700                     continue;
5701                 }
5702
5703                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5704                     const int sub_mb_type= h->sub_mb_type[i];
5705                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5706                     for(j=0; j<sub_partition_count[i]; j++){
5707                         int mpx, mpy;
5708                         int mx, my;
5709                         const int index= 4*i + block_width*j;
5710                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5711                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5712                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5713
5714                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5715                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5716                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5717
5718                         if(IS_SUB_8X8(sub_mb_type)){
5719                             mv_cache[ 1 ][0]=
5720                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5721                             mv_cache[ 1 ][1]=
5722                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5723
5724                             mvd_cache[ 1 ][0]=
5725                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5726                             mvd_cache[ 1 ][1]=
5727                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5728                         }else if(IS_SUB_8X4(sub_mb_type)){
5729                             mv_cache[ 1 ][0]= mx;
5730                             mv_cache[ 1 ][1]= my;
5731
5732                             mvd_cache[ 1 ][0]= mx - mpx;
5733                             mvd_cache[ 1 ][1]= my - mpy;
5734                         }else if(IS_SUB_4X8(sub_mb_type)){
5735                             mv_cache[ 8 ][0]= mx;
5736                             mv_cache[ 8 ][1]= my;
5737
5738                             mvd_cache[ 8 ][0]= mx - mpx;
5739                             mvd_cache[ 8 ][1]= my - mpy;
5740                         }
5741                         mv_cache[ 0 ][0]= mx;
5742                         mv_cache[ 0 ][1]= my;
5743
5744                         mvd_cache[ 0 ][0]= mx - mpx;
5745                         mvd_cache[ 0 ][1]= my - mpy;
5746                     }
5747                 }else{
5748                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5749                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5750                     p[0] = p[1] = p[8] = p[9] = 0;
5751                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5752                 }
5753             }
5754         }
5755     } else if( IS_DIRECT(mb_type) ) {
5756         pred_direct_motion(h, &mb_type);
5757         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5758         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5759         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5760     } else {
5761         int list, mx, my, i, mpx, mpy;
5762         if(IS_16X16(mb_type)){
5763             for(list=0; list<h->list_count; list++){
5764                 if(IS_DIR(mb_type, 0, list)){
5765                     int ref;
5766                     if(h->ref_count[list] > 1){
5767                         ref= decode_cabac_mb_ref(h, list, 0);
5768                         if(ref >= (unsigned)h->ref_count[list]){
5769                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5770                             return -1;
5771                         }
5772                     }else
5773                         ref=0;
5774                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5775                 }else
5776                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5777             }
5778             for(list=0; list<h->list_count; list++){
5779                 if(IS_DIR(mb_type, 0, list)){
5780                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5781
5782                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5783                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5784                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5785
5786                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5787                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5788                 }else
5789                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5790             }
5791         }
5792         else if(IS_16X8(mb_type)){
5793             for(list=0; list<h->list_count; list++){
5794                     for(i=0; i<2; i++){
5795                         if(IS_DIR(mb_type, i, list)){
5796                             int ref;
5797                             if(h->ref_count[list] > 1){
5798                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5799                                 if(ref >= (unsigned)h->ref_count[list]){
5800                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5801                                     return -1;
5802                                 }
5803                             }else
5804                                 ref=0;
5805                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5806                         }else
5807                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5808                     }
5809             }
5810             for(list=0; list<h->list_count; list++){
5811                 for(i=0; i<2; i++){
5812                     if(IS_DIR(mb_type, i, list)){
5813                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5814                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5815                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5816                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5817
5818                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5819                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5820                     }else{
5821                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5822                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5823                     }
5824                 }
5825             }
5826         }else{
5827             assert(IS_8X16(mb_type));
5828             for(list=0; list<h->list_count; list++){
5829                     for(i=0; i<2; i++){
5830                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5831                             int ref;
5832                             if(h->ref_count[list] > 1){
5833                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5834                                 if(ref >= (unsigned)h->ref_count[list]){
5835                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5836                                     return -1;
5837                                 }
5838                             }else
5839                                 ref=0;
5840                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5841                         }else
5842                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5843                     }
5844             }
5845             for(list=0; list<h->list_count; list++){
5846                 for(i=0; i<2; i++){
5847                     if(IS_DIR(mb_type, i, list)){
5848                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5849                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5850                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5851
5852                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5853                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5854                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5855                     }else{
5856                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5857                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5858                     }
5859                 }
5860             }
5861         }
5862     }
5863
5864    if( IS_INTER( mb_type ) ) {
5865         h->chroma_pred_mode_table[mb_xy] = 0;
5866         write_back_motion( h, mb_type );
5867    }
5868
5869     if( !IS_INTRA16x16( mb_type ) ) {
5870         cbp  = decode_cabac_mb_cbp_luma( h );
5871         if(CHROMA)
5872             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5873     }
5874
5875     h->cbp_table[mb_xy] = h->cbp = cbp;
5876
5877     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5878         if( decode_cabac_mb_transform_size( h ) )
5879             mb_type |= MB_TYPE_8x8DCT;
5880     }
5881     s->current_picture.mb_type[mb_xy]= mb_type;
5882
5883     if( cbp || IS_INTRA16x16( mb_type ) ) {
5884         const uint8_t *scan, *scan8x8, *dc_scan;
5885         const uint32_t *qmul;
5886         int dqp;
5887
5888         if(IS_INTERLACED(mb_type)){
5889             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5890             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5891             dc_scan= luma_dc_field_scan;
5892         }else{
5893             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5894             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5895             dc_scan= luma_dc_zigzag_scan;
5896         }
5897
5898         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5899         if( dqp == INT_MIN ){
5900             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5901             return -1;
5902         }
5903         s->qscale += dqp;
5904         if(((unsigned)s->qscale) > 51){
5905             if(s->qscale<0) s->qscale+= 52;
5906             else            s->qscale-= 52;
5907         }
5908         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5909         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5910
5911         if( IS_INTRA16x16( mb_type ) ) {
5912             int i;
5913             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5914             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5915
5916             if( cbp&15 ) {
5917                 qmul = h->dequant4_coeff[0][s->qscale];
5918                 for( i = 0; i < 16; i++ ) {
5919                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5920                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5921                 }
5922             } else {
5923                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5924             }
5925         } else {
5926             int i8x8, i4x4;
5927             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5928                 if( cbp & (1<<i8x8) ) {
5929                     if( IS_8x8DCT(mb_type) ) {
5930                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5931                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5932                     } else {
5933                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5934                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5935                             const int index = 4*i8x8 + i4x4;
5936                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5937 //START_TIMER
5938                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5939 //STOP_TIMER("decode_residual")
5940                         }
5941                     }
5942                 } else {
5943                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5944                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5945                 }
5946             }
5947         }
5948
5949         if( cbp&0x30 ){
5950             int c;
5951             for( c = 0; c < 2; c++ ) {
5952                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5953                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5954             }
5955         }
5956
5957         if( cbp&0x20 ) {
5958             int c, i;
5959             for( c = 0; c < 2; c++ ) {
5960                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5961                 for( i = 0; i < 4; i++ ) {
5962                     const int index = 16 + 4 * c + i;
5963                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5964                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5965                 }
5966             }
5967         } else {
5968             uint8_t * const nnz= &h->non_zero_count_cache[0];
5969             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5970             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5971         }
5972     } else {
5973         uint8_t * const nnz= &h->non_zero_count_cache[0];
5974         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5975         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5976         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5977         h->last_qscale_diff = 0;
5978     }
5979
5980     s->current_picture.qscale_table[mb_xy]= s->qscale;
5981     write_back_non_zero_count(h);
5982
5983     if(MB_MBAFF){
5984         h->ref_count[0] >>= 1;
5985         h->ref_count[1] >>= 1;
5986     }
5987
5988     return 0;
5989 }
5990
5991
5992 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5993     const int index_a = qp + h->slice_alpha_c0_offset;
5994     const int alpha = (alpha_table+52)[index_a];
5995     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5996
5997     if( bS[0] < 4 ) {
5998         int8_t tc[4];
5999         tc[0] = (tc0_table+52)[index_a][bS[0]];
6000         tc[1] = (tc0_table+52)[index_a][bS[1]];
6001         tc[2] = (tc0_table+52)[index_a][bS[2]];
6002         tc[3] = (tc0_table+52)[index_a][bS[3]];
6003         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6004     } else {
6005         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
6006     }
6007 }
6008 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6009     const int index_a = qp + h->slice_alpha_c0_offset;
6010     const int alpha = (alpha_table+52)[index_a];
6011     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6012
6013     if( bS[0] < 4 ) {
6014         int8_t tc[4];
6015         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6016         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6017         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6018         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6019         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6020     } else {
6021         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6022     }
6023 }
6024
6025 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6026     int i;
6027     for( i = 0; i < 16; i++, pix += stride) {
6028         int index_a;
6029         int alpha;
6030         int beta;
6031
6032         int qp_index;
6033         int bS_index = (i >> 1);
6034         if (!MB_FIELD) {
6035             bS_index &= ~1;
6036             bS_index |= (i & 1);
6037         }
6038
6039         if( bS[bS_index] == 0 ) {
6040             continue;
6041         }
6042
6043         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6044         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6045         alpha = (alpha_table+52)[index_a];
6046         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6047
6048         if( bS[bS_index] < 4 ) {
6049             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
6050             const int p0 = pix[-1];
6051             const int p1 = pix[-2];
6052             const int p2 = pix[-3];
6053             const int q0 = pix[0];
6054             const int q1 = pix[1];
6055             const int q2 = pix[2];
6056
6057             if( FFABS( p0 - q0 ) < alpha &&
6058                 FFABS( p1 - p0 ) < beta &&
6059                 FFABS( q1 - q0 ) < beta ) {
6060                 int tc = tc0;
6061                 int i_delta;
6062
6063                 if( FFABS( p2 - p0 ) < beta ) {
6064                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6065                     tc++;
6066                 }
6067                 if( FFABS( q2 - q0 ) < beta ) {
6068                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6069                     tc++;
6070                 }
6071
6072                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6073                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6074                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6075                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6076             }
6077         }else{
6078             const int p0 = pix[-1];
6079             const int p1 = pix[-2];
6080             const int p2 = pix[-3];
6081
6082             const int q0 = pix[0];
6083             const int q1 = pix[1];
6084             const int q2 = pix[2];
6085
6086             if( FFABS( p0 - q0 ) < alpha &&
6087                 FFABS( p1 - p0 ) < beta &&
6088                 FFABS( q1 - q0 ) < beta ) {
6089
6090                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6091                     if( FFABS( p2 - p0 ) < beta)
6092                     {
6093                         const int p3 = pix[-4];
6094                         /* p0', p1', p2' */
6095                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6096                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6097                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6098                     } else {
6099                         /* p0' */
6100                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6101                     }
6102                     if( FFABS( q2 - q0 ) < beta)
6103                     {
6104                         const int q3 = pix[3];
6105                         /* q0', q1', q2' */
6106                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6107                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6108                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6109                     } else {
6110                         /* q0' */
6111                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6112                     }
6113                 }else{
6114                     /* p0', q0' */
6115                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6116                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6117                 }
6118                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6119             }
6120         }
6121     }
6122 }
6123 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6124     int i;
6125     for( i = 0; i < 8; i++, pix += stride) {
6126         int index_a;
6127         int alpha;
6128         int beta;
6129
6130         int qp_index;
6131         int bS_index = i;
6132
6133         if( bS[bS_index] == 0 ) {
6134             continue;
6135         }
6136
6137         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6138         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6139         alpha = (alpha_table+52)[index_a];
6140         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6141
6142         if( bS[bS_index] < 4 ) {
6143             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6144             const int p0 = pix[-1];
6145             const int p1 = pix[-2];
6146             const int q0 = pix[0];
6147             const int q1 = pix[1];
6148
6149             if( FFABS( p0 - q0 ) < alpha &&
6150                 FFABS( p1 - p0 ) < beta &&
6151                 FFABS( q1 - q0 ) < beta ) {
6152                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6153
6154                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6155                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6156                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6157             }
6158         }else{
6159             const int p0 = pix[-1];
6160             const int p1 = pix[-2];
6161             const int q0 = pix[0];
6162             const int q1 = pix[1];
6163
6164             if( FFABS( p0 - q0 ) < alpha &&
6165                 FFABS( p1 - p0 ) < beta &&
6166                 FFABS( q1 - q0 ) < beta ) {
6167
6168                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6169                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6170                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6171             }
6172         }
6173     }
6174 }
6175
6176 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6177     const int index_a = qp + h->slice_alpha_c0_offset;
6178     const int alpha = (alpha_table+52)[index_a];
6179     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6180
6181     if( bS[0] < 4 ) {
6182         int8_t tc[4];
6183         tc[0] = (tc0_table+52)[index_a][bS[0]];
6184         tc[1] = (tc0_table+52)[index_a][bS[1]];
6185         tc[2] = (tc0_table+52)[index_a][bS[2]];
6186         tc[3] = (tc0_table+52)[index_a][bS[3]];
6187         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6188     } else {
6189         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6190     }
6191 }
6192
6193 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6194     const int index_a = qp + h->slice_alpha_c0_offset;
6195     const int alpha = (alpha_table+52)[index_a];
6196     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6197
6198     if( bS[0] < 4 ) {
6199         int8_t tc[4];
6200         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6201         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6202         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6203         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6204         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6205     } else {
6206         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6207     }
6208 }
6209
6210 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6211     MpegEncContext * const s = &h->s;
6212     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6213     int mb_xy, mb_type;
6214     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6215
6216     mb_xy = h->mb_xy;
6217
6218     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6219         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6220        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6221                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6222         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6223         return;
6224     }
6225     assert(!FRAME_MBAFF);
6226
6227     mb_type = s->current_picture.mb_type[mb_xy];
6228     qp = s->current_picture.qscale_table[mb_xy];
6229     qp0 = s->current_picture.qscale_table[mb_xy-1];
6230     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6231     qpc = get_chroma_qp( h, 0, qp );
6232     qpc0 = get_chroma_qp( h, 0, qp0 );
6233     qpc1 = get_chroma_qp( h, 0, qp1 );
6234     qp0 = (qp + qp0 + 1) >> 1;
6235     qp1 = (qp + qp1 + 1) >> 1;
6236     qpc0 = (qpc + qpc0 + 1) >> 1;
6237     qpc1 = (qpc + qpc1 + 1) >> 1;
6238     qp_thresh = 15 - h->slice_alpha_c0_offset;
6239     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6240        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6241         return;
6242
6243     if( IS_INTRA(mb_type) ) {
6244         int16_t bS4[4] = {4,4,4,4};
6245         int16_t bS3[4] = {3,3,3,3};
6246         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6247         if( IS_8x8DCT(mb_type) ) {
6248             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6249             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6250             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6251             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6252         } else {
6253             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6254             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6255             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6256             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6257             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6258             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6259             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6260             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6261         }
6262         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6263         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6264         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6265         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6266         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6267         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6268         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6269         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6270         return;
6271     } else {
6272         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6273         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6274         int edges;
6275         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6276             edges = 4;
6277             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6278         } else {
6279             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6280                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6281             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6282                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6283                              ? 3 : 0;
6284             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6285             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6286             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6287                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6288         }
6289         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6290             bSv[0][0] = 0x0004000400040004ULL;
6291         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6292             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6293
6294 #define FILTER(hv,dir,edge)\
6295         if(bSv[dir][edge]) {\
6296             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6297             if(!(edge&1)) {\
6298                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6299                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6300             }\
6301         }
6302         if( edges == 1 ) {
6303             FILTER(v,0,0);
6304             FILTER(h,1,0);
6305         } else if( IS_8x8DCT(mb_type) ) {
6306             FILTER(v,0,0);
6307             FILTER(v,0,2);
6308             FILTER(h,1,0);
6309             FILTER(h,1,2);
6310         } else {
6311             FILTER(v,0,0);
6312             FILTER(v,0,1);
6313             FILTER(v,0,2);
6314             FILTER(v,0,3);
6315             FILTER(h,1,0);
6316             FILTER(h,1,1);
6317             FILTER(h,1,2);
6318             FILTER(h,1,3);
6319         }
6320 #undef FILTER
6321     }
6322 }
6323
6324
6325 static av_always_inline void filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6326     MpegEncContext * const s = &h->s;
6327     int edge;
6328     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6329     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6330     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6331     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6332     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6333
6334     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6335                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6336     // how often to recheck mv-based bS when iterating between edges
6337     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6338                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6339     // how often to recheck mv-based bS when iterating along each edge
6340     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6341
6342     if (first_vertical_edge_done) {
6343         start = 1;
6344     }
6345
6346     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6347         start = 1;
6348
6349     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6350         && !IS_INTERLACED(mb_type)
6351         && IS_INTERLACED(mbm_type)
6352         ) {
6353         // This is a special case in the norm where the filtering must
6354         // be done twice (one each of the field) even if we are in a
6355         // frame macroblock.
6356         //
6357         static const int nnz_idx[4] = {4,5,6,3};
6358         unsigned int tmp_linesize   = 2 *   linesize;
6359         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6360         int mbn_xy = mb_xy - 2 * s->mb_stride;
6361         int qp;
6362         int i, j;
6363         int16_t bS[4];
6364
6365         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6366             if( IS_INTRA(mb_type) ||
6367                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6368                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6369             } else {
6370                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6371                 for( i = 0; i < 4; i++ ) {
6372                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6373                         mbn_nnz[nnz_idx[i]] != 0 )
6374                         bS[i] = 2;
6375                     else
6376                         bS[i] = 1;
6377                 }
6378             }
6379             // Do not use s->qscale as luma quantizer because it has not the same
6380             // value in IPCM macroblocks.
6381             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6382             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6383             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6384             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6385             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6386                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6387             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6388                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6389         }
6390
6391         start = 1;
6392     }
6393
6394     /* Calculate bS */
6395     for( edge = start; edge < edges; edge++ ) {
6396         /* mbn_xy: neighbor macroblock */
6397         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6398         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6399         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6400         int16_t bS[4];
6401         int qp;
6402
6403         if( (edge&1) && IS_8x8DCT(mb_type) )
6404             continue;
6405
6406         if( IS_INTRA(mb_type) ||
6407             IS_INTRA(mbn_type) ) {
6408             int value;
6409             if (edge == 0) {
6410                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6411                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6412                 ) {
6413                     value = 4;
6414                 } else {
6415                     value = 3;
6416                 }
6417             } else {
6418                 value = 3;
6419             }
6420             bS[0] = bS[1] = bS[2] = bS[3] = value;
6421         } else {
6422             int i, l;
6423             int mv_done;
6424
6425             if( edge & mask_edge ) {
6426                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6427                 mv_done = 1;
6428             }
6429             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6430                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6431                 mv_done = 1;
6432             }
6433             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6434                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6435                 int bn_idx= b_idx - (dir ? 8:1);
6436                 int v = 0;
6437
6438                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6439                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6440                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6441                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6442                 }
6443
6444                 if(h->slice_type_nos == FF_B_TYPE && v){
6445                     v=0;
6446                     for( l = 0; !v && l < 2; l++ ) {
6447                         int ln= 1-l;
6448                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6449                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6450                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6451                     }
6452                 }
6453
6454                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6455                 mv_done = 1;
6456             }
6457             else
6458                 mv_done = 0;
6459
6460             for( i = 0; i < 4; i++ ) {
6461                 int x = dir == 0 ? edge : i;
6462                 int y = dir == 0 ? i    : edge;
6463                 int b_idx= 8 + 4 + x + 8*y;
6464                 int bn_idx= b_idx - (dir ? 8:1);
6465
6466                 if( h->non_zero_count_cache[b_idx] |
6467                     h->non_zero_count_cache[bn_idx] ) {
6468                     bS[i] = 2;
6469                 }
6470                 else if(!mv_done)
6471                 {
6472                     bS[i] = 0;
6473                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6474                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6475                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6476                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6477                             bS[i] = 1;
6478                             break;
6479                         }
6480                     }
6481
6482                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6483                         bS[i] = 0;
6484                         for( l = 0; l < 2; l++ ) {
6485                             int ln= 1-l;
6486                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6487                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6488                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6489                                 bS[i] = 1;
6490                                 break;
6491                             }
6492                         }
6493                     }
6494                 }
6495             }
6496
6497             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6498                 continue;
6499         }
6500
6501         /* Filter edge */
6502         // Do not use s->qscale as luma quantizer because it has not the same
6503         // value in IPCM macroblocks.
6504         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6505         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6506         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6507         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6508         if( dir == 0 ) {
6509             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6510             if( (edge&1) == 0 ) {
6511                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6512                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6513                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6514                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6515             }
6516         } else {
6517             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6518             if( (edge&1) == 0 ) {
6519                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6520                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6521                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6522                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6523             }
6524         }
6525     }
6526 }
6527
6528 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6529     MpegEncContext * const s = &h->s;
6530     const int mb_xy= mb_x + mb_y*s->mb_stride;
6531     const int mb_type = s->current_picture.mb_type[mb_xy];
6532     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6533     int first_vertical_edge_done = 0;
6534     av_unused int dir;
6535
6536     //for sufficiently low qp, filtering wouldn't do anything
6537     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6538     if(!FRAME_MBAFF){
6539         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6540         int qp = s->current_picture.qscale_table[mb_xy];
6541         if(qp <= qp_thresh
6542            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6543            && (h->top_mb_xy < 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6544             return;
6545         }
6546     }
6547
6548     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6549     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6550         int top_type, left_type[2];
6551         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6552         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6553         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6554
6555         if(IS_8x8DCT(top_type)){
6556             h->non_zero_count_cache[4+8*0]=
6557             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6558             h->non_zero_count_cache[6+8*0]=
6559             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6560         }
6561         if(IS_8x8DCT(left_type[0])){
6562             h->non_zero_count_cache[3+8*1]=
6563             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6564         }
6565         if(IS_8x8DCT(left_type[1])){
6566             h->non_zero_count_cache[3+8*3]=
6567             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6568         }
6569
6570         if(IS_8x8DCT(mb_type)){
6571             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6572             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6573
6574             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6575             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6576
6577             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6578             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6579
6580             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6581             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6582         }
6583     }
6584
6585     if (FRAME_MBAFF
6586             // left mb is in picture
6587             && h->slice_table[mb_xy-1] != 0xFFFF
6588             // and current and left pair do not have the same interlaced type
6589             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6590             // and left mb is in the same slice if deblocking_filter == 2
6591             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6592         /* First vertical edge is different in MBAFF frames
6593          * There are 8 different bS to compute and 2 different Qp
6594          */
6595         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6596         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6597         int16_t bS[8];
6598         int qp[2];
6599         int bqp[2];
6600         int rqp[2];
6601         int mb_qp, mbn0_qp, mbn1_qp;
6602         int i;
6603         first_vertical_edge_done = 1;
6604
6605         if( IS_INTRA(mb_type) )
6606             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6607         else {
6608             for( i = 0; i < 8; i++ ) {
6609                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6610
6611                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6612                     bS[i] = 4;
6613                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6614                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6615                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6616                                                                        :
6617                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6618                     bS[i] = 2;
6619                 else
6620                     bS[i] = 1;
6621             }
6622         }
6623
6624         mb_qp = s->current_picture.qscale_table[mb_xy];
6625         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6626         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6627         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6628         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6629                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6630         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6631                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6632         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6633         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6634                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6635         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6636                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6637
6638         /* Filter edge */
6639         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6640         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6641         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6642         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6643         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6644     }
6645
6646 #if CONFIG_SMALL
6647     for( dir = 0; dir < 2; dir++ )
6648         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6649 #else
6650     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6651     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6652 #endif
6653 }
6654
6655 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6656     H264Context *h = *(void**)arg;
6657     MpegEncContext * const s = &h->s;
6658     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6659
6660     s->mb_skip_run= -1;
6661
6662     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6663                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6664
6665     if( h->pps.cabac ) {
6666         int i;
6667
6668         /* realign */
6669         align_get_bits( &s->gb );
6670
6671         /* init cabac */
6672         ff_init_cabac_states( &h->cabac);
6673         ff_init_cabac_decoder( &h->cabac,
6674                                s->gb.buffer + get_bits_count(&s->gb)/8,
6675                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6676         /* calculate pre-state */
6677         for( i= 0; i < 460; i++ ) {
6678             int pre;
6679             if( h->slice_type_nos == FF_I_TYPE )
6680                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6681             else
6682                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6683
6684             if( pre <= 63 )
6685                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6686             else
6687                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6688         }
6689
6690         for(;;){
6691 //START_TIMER
6692             int ret = decode_mb_cabac(h);
6693             int eos;
6694 //STOP_TIMER("decode_mb_cabac")
6695
6696             if(ret>=0) hl_decode_mb(h);
6697
6698             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6699                 s->mb_y++;
6700
6701                 ret = decode_mb_cabac(h);
6702
6703                 if(ret>=0) hl_decode_mb(h);
6704                 s->mb_y--;
6705             }
6706             eos = get_cabac_terminate( &h->cabac );
6707
6708             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6709                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6710                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6711                 return -1;
6712             }
6713
6714             if( ++s->mb_x >= s->mb_width ) {
6715                 s->mb_x = 0;
6716                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6717                 ++s->mb_y;
6718                 if(FIELD_OR_MBAFF_PICTURE) {
6719                     ++s->mb_y;
6720                 }
6721             }
6722
6723             if( eos || s->mb_y >= s->mb_height ) {
6724                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6725                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6726                 return 0;
6727             }
6728         }
6729
6730     } else {
6731         for(;;){
6732             int ret = decode_mb_cavlc(h);
6733
6734             if(ret>=0) hl_decode_mb(h);
6735
6736             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6737                 s->mb_y++;
6738                 ret = decode_mb_cavlc(h);
6739
6740                 if(ret>=0) hl_decode_mb(h);
6741                 s->mb_y--;
6742             }
6743
6744             if(ret<0){
6745                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6746                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6747
6748                 return -1;
6749             }
6750
6751             if(++s->mb_x >= s->mb_width){
6752                 s->mb_x=0;
6753                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6754                 ++s->mb_y;
6755                 if(FIELD_OR_MBAFF_PICTURE) {
6756                     ++s->mb_y;
6757                 }
6758                 if(s->mb_y >= s->mb_height){
6759                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6760
6761                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6762                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6763
6764                         return 0;
6765                     }else{
6766                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6767
6768                         return -1;
6769                     }
6770                 }
6771             }
6772
6773             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6774                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6775                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6776                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6777
6778                     return 0;
6779                 }else{
6780                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6781
6782                     return -1;
6783                 }
6784             }
6785         }
6786     }
6787
6788 #if 0
6789     for(;s->mb_y < s->mb_height; s->mb_y++){
6790         for(;s->mb_x < s->mb_width; s->mb_x++){
6791             int ret= decode_mb(h);
6792
6793             hl_decode_mb(h);
6794
6795             if(ret<0){
6796                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6797                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6798
6799                 return -1;
6800             }
6801
6802             if(++s->mb_x >= s->mb_width){
6803                 s->mb_x=0;
6804                 if(++s->mb_y >= s->mb_height){
6805                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6806                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6807
6808                         return 0;
6809                     }else{
6810                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6811
6812                         return -1;
6813                     }
6814                 }
6815             }
6816
6817             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6818                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6819                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6820
6821                     return 0;
6822                 }else{
6823                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6824
6825                     return -1;
6826                 }
6827             }
6828         }
6829         s->mb_x=0;
6830         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6831     }
6832 #endif
6833     return -1; //not reached
6834 }
6835
6836 static int decode_picture_timing(H264Context *h){
6837     MpegEncContext * const s = &h->s;
6838     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6839         h->sei_cpb_removal_delay = get_bits(&s->gb, h->sps.cpb_removal_delay_length);
6840         h->sei_dpb_output_delay = get_bits(&s->gb, h->sps.dpb_output_delay_length);
6841     }
6842     if(h->sps.pic_struct_present_flag){
6843         unsigned int i, num_clock_ts;
6844         h->sei_pic_struct = get_bits(&s->gb, 4);
6845         h->sei_ct_type    = 0;
6846
6847         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6848             return -1;
6849
6850         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6851
6852         for (i = 0 ; i < num_clock_ts ; i++){
6853             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6854                 unsigned int full_timestamp_flag;
6855                 h->sei_ct_type |= 1<<get_bits(&s->gb, 2);
6856                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6857                 skip_bits(&s->gb, 5);                 /* counting_type */
6858                 full_timestamp_flag = get_bits(&s->gb, 1);
6859                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6860                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6861                 skip_bits(&s->gb, 8);                 /* n_frames */
6862                 if(full_timestamp_flag){
6863                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6864                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6865                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6866                 }else{
6867                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6868                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6869                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6870                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6871                             if(get_bits(&s->gb, 1))   /* hours_flag */
6872                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6873                         }
6874                     }
6875                 }
6876                 if(h->sps.time_offset_length > 0)
6877                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6878             }
6879         }
6880     }
6881     return 0;
6882 }
6883
6884 static int decode_unregistered_user_data(H264Context *h, int size){
6885     MpegEncContext * const s = &h->s;
6886     uint8_t user_data[16+256];
6887     int e, build, i;
6888
6889     if(size<16)
6890         return -1;
6891
6892     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6893         user_data[i]= get_bits(&s->gb, 8);
6894     }
6895
6896     user_data[i]= 0;
6897     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6898     if(e==1 && build>=0)
6899         h->x264_build= build;
6900
6901     if(s->avctx->debug & FF_DEBUG_BUGS)
6902         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6903
6904     for(; i<size; i++)
6905         skip_bits(&s->gb, 8);
6906
6907     return 0;
6908 }
6909
6910 static int decode_recovery_point(H264Context *h){
6911     MpegEncContext * const s = &h->s;
6912
6913     h->sei_recovery_frame_cnt = get_ue_golomb(&s->gb);
6914     skip_bits(&s->gb, 4);       /* 1b exact_match_flag, 1b broken_link_flag, 2b changing_slice_group_idc */
6915
6916     return 0;
6917 }
6918
6919 static int decode_buffering_period(H264Context *h){
6920     MpegEncContext * const s = &h->s;
6921     unsigned int sps_id;
6922     int sched_sel_idx;
6923     SPS *sps;
6924
6925     sps_id = get_ue_golomb_31(&s->gb);
6926     if(sps_id > 31 || !h->sps_buffers[sps_id]) {
6927         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS %d referenced in buffering period\n", sps_id);
6928         return -1;
6929     }
6930     sps = h->sps_buffers[sps_id];
6931
6932     // NOTE: This is really so duplicated in the standard... See H.264, D.1.1
6933     if (sps->nal_hrd_parameters_present_flag) {
6934         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6935             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6936             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6937         }
6938     }
6939     if (sps->vcl_hrd_parameters_present_flag) {
6940         for (sched_sel_idx = 0; sched_sel_idx < sps->cpb_cnt; sched_sel_idx++) {
6941             h->initial_cpb_removal_delay[sched_sel_idx] = get_bits(&s->gb, sps->initial_cpb_removal_delay_length);
6942             skip_bits(&s->gb, sps->initial_cpb_removal_delay_length); // initial_cpb_removal_delay_offset
6943         }
6944     }
6945
6946     h->sei_buffering_period_present = 1;
6947     return 0;
6948 }
6949
6950 int ff_h264_decode_sei(H264Context *h){
6951     MpegEncContext * const s = &h->s;
6952
6953     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6954         int size, type;
6955
6956         type=0;
6957         do{
6958             type+= show_bits(&s->gb, 8);
6959         }while(get_bits(&s->gb, 8) == 255);
6960
6961         size=0;
6962         do{
6963             size+= show_bits(&s->gb, 8);
6964         }while(get_bits(&s->gb, 8) == 255);
6965
6966         switch(type){
6967         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6968             if(decode_picture_timing(h) < 0)
6969                 return -1;
6970             break;
6971         case SEI_TYPE_USER_DATA_UNREGISTERED:
6972             if(decode_unregistered_user_data(h, size) < 0)
6973                 return -1;
6974             break;
6975         case SEI_TYPE_RECOVERY_POINT:
6976             if(decode_recovery_point(h) < 0)
6977                 return -1;
6978             break;
6979         case SEI_BUFFERING_PERIOD:
6980             if(decode_buffering_period(h) < 0)
6981                 return -1;
6982             break;
6983         default:
6984             skip_bits(&s->gb, 8*size);
6985         }
6986
6987         //FIXME check bits here
6988         align_get_bits(&s->gb);
6989     }
6990
6991     return 0;
6992 }
6993
6994 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6995     MpegEncContext * const s = &h->s;
6996     int cpb_count, i;
6997     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6998
6999     if(cpb_count > 32U){
7000         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
7001         return -1;
7002     }
7003
7004     get_bits(&s->gb, 4); /* bit_rate_scale */
7005     get_bits(&s->gb, 4); /* cpb_size_scale */
7006     for(i=0; i<cpb_count; i++){
7007         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7008         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7009         get_bits1(&s->gb);     /* cbr_flag */
7010     }
7011     sps->initial_cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7012     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
7013     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
7014     sps->time_offset_length = get_bits(&s->gb, 5);
7015     sps->cpb_cnt = cpb_count;
7016     return 0;
7017 }
7018
7019 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7020     MpegEncContext * const s = &h->s;
7021     int aspect_ratio_info_present_flag;
7022     unsigned int aspect_ratio_idc;
7023
7024     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7025
7026     if( aspect_ratio_info_present_flag ) {
7027         aspect_ratio_idc= get_bits(&s->gb, 8);
7028         if( aspect_ratio_idc == EXTENDED_SAR ) {
7029             sps->sar.num= get_bits(&s->gb, 16);
7030             sps->sar.den= get_bits(&s->gb, 16);
7031         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
7032             sps->sar=  pixel_aspect[aspect_ratio_idc];
7033         }else{
7034             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7035             return -1;
7036         }
7037     }else{
7038         sps->sar.num=
7039         sps->sar.den= 0;
7040     }
7041 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7042
7043     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7044         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7045     }
7046
7047     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7048         get_bits(&s->gb, 3);    /* video_format */
7049         get_bits1(&s->gb);      /* video_full_range_flag */
7050         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7051             get_bits(&s->gb, 8); /* colour_primaries */
7052             get_bits(&s->gb, 8); /* transfer_characteristics */
7053             get_bits(&s->gb, 8); /* matrix_coefficients */
7054         }
7055     }
7056
7057     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7058         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7059         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7060     }
7061
7062     sps->timing_info_present_flag = get_bits1(&s->gb);
7063     if(sps->timing_info_present_flag){
7064         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7065         sps->time_scale = get_bits_long(&s->gb, 32);
7066         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7067     }
7068
7069     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7070     if(sps->nal_hrd_parameters_present_flag)
7071         if(decode_hrd_parameters(h, sps) < 0)
7072             return -1;
7073     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7074     if(sps->vcl_hrd_parameters_present_flag)
7075         if(decode_hrd_parameters(h, sps) < 0)
7076             return -1;
7077     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
7078         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7079     sps->pic_struct_present_flag = get_bits1(&s->gb);
7080
7081     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7082     if(sps->bitstream_restriction_flag){
7083         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7084         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7085         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7086         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7087         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7088         sps->num_reorder_frames= get_ue_golomb(&s->gb);
7089         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7090
7091         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
7092             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
7093             return -1;
7094         }
7095     }
7096
7097     return 0;
7098 }
7099
7100 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7101                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7102     MpegEncContext * const s = &h->s;
7103     int i, last = 8, next = 8;
7104     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
7105     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7106         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7107     else
7108     for(i=0;i<size;i++){
7109         if(next)
7110             next = (last + get_se_golomb(&s->gb)) & 0xff;
7111         if(!i && !next){ /* matrix not written, we use the preset one */
7112             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7113             break;
7114         }
7115         last = factors[scan[i]] = next ? next : last;
7116     }
7117 }
7118
7119 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7120                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7121     MpegEncContext * const s = &h->s;
7122     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7123     const uint8_t *fallback[4] = {
7124         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7125         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7126         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7127         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7128     };
7129     if(get_bits1(&s->gb)){
7130         sps->scaling_matrix_present |= is_sps;
7131         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7132         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7133         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7134         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7135         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7136         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7137         if(is_sps || pps->transform_8x8_mode){
7138             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7139             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7140         }
7141     }
7142 }
7143
7144 int ff_h264_decode_seq_parameter_set(H264Context *h){
7145     MpegEncContext * const s = &h->s;
7146     int profile_idc, level_idc;
7147     unsigned int sps_id;
7148     int i;
7149     SPS *sps;
7150
7151     profile_idc= get_bits(&s->gb, 8);
7152     get_bits1(&s->gb);   //constraint_set0_flag
7153     get_bits1(&s->gb);   //constraint_set1_flag
7154     get_bits1(&s->gb);   //constraint_set2_flag
7155     get_bits1(&s->gb);   //constraint_set3_flag
7156     get_bits(&s->gb, 4); // reserved
7157     level_idc= get_bits(&s->gb, 8);
7158     sps_id= get_ue_golomb_31(&s->gb);
7159
7160     if(sps_id >= MAX_SPS_COUNT) {
7161         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7162         return -1;
7163     }
7164     sps= av_mallocz(sizeof(SPS));
7165     if(sps == NULL)
7166         return -1;
7167
7168     sps->profile_idc= profile_idc;
7169     sps->level_idc= level_idc;
7170
7171     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7172     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7173     sps->scaling_matrix_present = 0;
7174
7175     if(sps->profile_idc >= 100){ //high profile
7176         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7177         if(sps->chroma_format_idc == 3)
7178             sps->residual_color_transform_flag = get_bits1(&s->gb);
7179         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7180         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7181         sps->transform_bypass = get_bits1(&s->gb);
7182         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7183     }else{
7184         sps->chroma_format_idc= 1;
7185     }
7186
7187     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7188     sps->poc_type= get_ue_golomb_31(&s->gb);
7189
7190     if(sps->poc_type == 0){ //FIXME #define
7191         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7192     } else if(sps->poc_type == 1){//FIXME #define
7193         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7194         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7195         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7196         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7197
7198         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7199             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7200             goto fail;
7201         }
7202
7203         for(i=0; i<sps->poc_cycle_length; i++)
7204             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7205     }else if(sps->poc_type != 2){
7206         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7207         goto fail;
7208     }
7209
7210     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7211     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7212         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7213         goto fail;
7214     }
7215     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7216     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7217     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7218     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7219        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7220         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7221         goto fail;
7222     }
7223
7224     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7225     if(!sps->frame_mbs_only_flag)
7226         sps->mb_aff= get_bits1(&s->gb);
7227     else
7228         sps->mb_aff= 0;
7229
7230     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7231
7232 #ifndef ALLOW_INTERLACE
7233     if(sps->mb_aff)
7234         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7235 #endif
7236     sps->crop= get_bits1(&s->gb);
7237     if(sps->crop){
7238         sps->crop_left  = get_ue_golomb(&s->gb);
7239         sps->crop_right = get_ue_golomb(&s->gb);
7240         sps->crop_top   = get_ue_golomb(&s->gb);
7241         sps->crop_bottom= get_ue_golomb(&s->gb);
7242         if(sps->crop_left || sps->crop_top){
7243             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7244         }
7245         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7246             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7247         }
7248     }else{
7249         sps->crop_left  =
7250         sps->crop_right =
7251         sps->crop_top   =
7252         sps->crop_bottom= 0;
7253     }
7254
7255     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7256     if( sps->vui_parameters_present_flag )
7257         decode_vui_parameters(h, sps);
7258
7259     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7260         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s %d/%d\n",
7261                sps_id, sps->profile_idc, sps->level_idc,
7262                sps->poc_type,
7263                sps->ref_frame_count,
7264                sps->mb_width, sps->mb_height,
7265                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7266                sps->direct_8x8_inference_flag ? "8B8" : "",
7267                sps->crop_left, sps->crop_right,
7268                sps->crop_top, sps->crop_bottom,
7269                sps->vui_parameters_present_flag ? "VUI" : "",
7270                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc],
7271                sps->timing_info_present_flag ? sps->num_units_in_tick : 0,
7272                sps->timing_info_present_flag ? sps->time_scale : 0
7273                );
7274     }
7275
7276     av_free(h->sps_buffers[sps_id]);
7277     h->sps_buffers[sps_id]= sps;
7278     h->sps = *sps;
7279     return 0;
7280 fail:
7281     av_free(sps);
7282     return -1;
7283 }
7284
7285 static void
7286 build_qp_table(PPS *pps, int t, int index)
7287 {
7288     int i;
7289     for(i = 0; i < 52; i++)
7290         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7291 }
7292
7293 int ff_h264_decode_picture_parameter_set(H264Context *h, int bit_length){
7294     MpegEncContext * const s = &h->s;
7295     unsigned int pps_id= get_ue_golomb(&s->gb);
7296     PPS *pps;
7297
7298     if(pps_id >= MAX_PPS_COUNT) {
7299         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7300         return -1;
7301     }
7302
7303     pps= av_mallocz(sizeof(PPS));
7304     if(pps == NULL)
7305         return -1;
7306     pps->sps_id= get_ue_golomb_31(&s->gb);
7307     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7308         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7309         goto fail;
7310     }
7311
7312     pps->cabac= get_bits1(&s->gb);
7313     pps->pic_order_present= get_bits1(&s->gb);
7314     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7315     if(pps->slice_group_count > 1 ){
7316         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7317         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7318         switch(pps->mb_slice_group_map_type){
7319         case 0:
7320 #if 0
7321 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7322 |    run_length[ i ]                                |1  |ue(v)   |
7323 #endif
7324             break;
7325         case 2:
7326 #if 0
7327 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7328 |{                                                  |   |        |
7329 |    top_left_mb[ i ]                               |1  |ue(v)   |
7330 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7331 |   }                                               |   |        |
7332 #endif
7333             break;
7334         case 3:
7335         case 4:
7336         case 5:
7337 #if 0
7338 |   slice_group_change_direction_flag               |1  |u(1)    |
7339 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7340 #endif
7341             break;
7342         case 6:
7343 #if 0
7344 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7345 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7346 |)                                                  |   |        |
7347 |    slice_group_id[ i ]                            |1  |u(v)    |
7348 #endif
7349             break;
7350         }
7351     }
7352     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7353     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7354     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7355         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7356         goto fail;
7357     }
7358
7359     pps->weighted_pred= get_bits1(&s->gb);
7360     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7361     pps->init_qp= get_se_golomb(&s->gb) + 26;
7362     pps->init_qs= get_se_golomb(&s->gb) + 26;
7363     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7364     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7365     pps->constrained_intra_pred= get_bits1(&s->gb);
7366     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7367
7368     pps->transform_8x8_mode= 0;
7369     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7370     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7371     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7372
7373     if(get_bits_count(&s->gb) < bit_length){
7374         pps->transform_8x8_mode= get_bits1(&s->gb);
7375         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7376         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7377     } else {
7378         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7379     }
7380
7381     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7382     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7383     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7384         h->pps.chroma_qp_diff= 1;
7385
7386     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7387         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7388                pps_id, pps->sps_id,
7389                pps->cabac ? "CABAC" : "CAVLC",
7390                pps->slice_group_count,
7391                pps->ref_count[0], pps->ref_count[1],
7392                pps->weighted_pred ? "weighted" : "",
7393                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7394                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7395                pps->constrained_intra_pred ? "CONSTR" : "",
7396                pps->redundant_pic_cnt_present ? "REDU" : "",
7397                pps->transform_8x8_mode ? "8x8DCT" : ""
7398                );
7399     }
7400
7401     av_free(h->pps_buffers[pps_id]);
7402     h->pps_buffers[pps_id]= pps;
7403     return 0;
7404 fail:
7405     av_free(pps);
7406     return -1;
7407 }
7408
7409 /**
7410  * Call decode_slice() for each context.
7411  *
7412  * @param h h264 master context
7413  * @param context_count number of contexts to execute
7414  */
7415 static void execute_decode_slices(H264Context *h, int context_count){
7416     MpegEncContext * const s = &h->s;
7417     AVCodecContext * const avctx= s->avctx;
7418     H264Context *hx;
7419     int i;
7420
7421     if (s->avctx->hwaccel)
7422         return;
7423     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7424         return;
7425     if(context_count == 1) {
7426         decode_slice(avctx, &h);
7427     } else {
7428         for(i = 1; i < context_count; i++) {
7429             hx = h->thread_context[i];
7430             hx->s.error_recognition = avctx->error_recognition;
7431             hx->s.error_count = 0;
7432         }
7433
7434         avctx->execute(avctx, (void *)decode_slice,
7435                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7436
7437         /* pull back stuff from slices to master context */
7438         hx = h->thread_context[context_count - 1];
7439         s->mb_x = hx->s.mb_x;
7440         s->mb_y = hx->s.mb_y;
7441         s->dropable = hx->s.dropable;
7442         s->picture_structure = hx->s.picture_structure;
7443         for(i = 1; i < context_count; i++)
7444             h->s.error_count += h->thread_context[i]->s.error_count;
7445     }
7446 }
7447
7448
7449 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7450     MpegEncContext * const s = &h->s;
7451     AVCodecContext * const avctx= s->avctx;
7452     int buf_index=0;
7453     H264Context *hx; ///< thread context
7454     int context_count = 0;
7455
7456     h->max_contexts = avctx->thread_count;
7457 #if 0
7458     int i;
7459     for(i=0; i<50; i++){
7460         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7461     }
7462 #endif
7463     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7464         h->current_slice = 0;
7465         if (!s->first_field)
7466             s->current_picture_ptr= NULL;
7467         reset_sei(h);
7468     }
7469
7470     for(;;){
7471         int consumed;
7472         int dst_length;
7473         int bit_length;
7474         const uint8_t *ptr;
7475         int i, nalsize = 0;
7476         int err;
7477
7478         if(h->is_avc) {
7479             if(buf_index >= buf_size) break;
7480             nalsize = 0;
7481             for(i = 0; i < h->nal_length_size; i++)
7482                 nalsize = (nalsize << 8) | buf[buf_index++];
7483             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7484                 if(nalsize == 1){
7485                     buf_index++;
7486                     continue;
7487                 }else{
7488                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7489                     break;
7490                 }
7491             }
7492         } else {
7493             // start code prefix search
7494             for(; buf_index + 3 < buf_size; buf_index++){
7495                 // This should always succeed in the first iteration.
7496                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7497                     break;
7498             }
7499
7500             if(buf_index+3 >= buf_size) break;
7501
7502             buf_index+=3;
7503         }
7504
7505         hx = h->thread_context[context_count];
7506
7507         ptr= ff_h264_decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7508         if (ptr==NULL || dst_length < 0){
7509             return -1;
7510         }
7511         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7512             dst_length--;
7513         bit_length= !dst_length ? 0 : (8*dst_length - ff_h264_decode_rbsp_trailing(h, ptr + dst_length - 1));
7514
7515         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7516             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7517         }
7518
7519         if (h->is_avc && (nalsize != consumed)){
7520             int i, debug_level = AV_LOG_DEBUG;
7521             for (i = consumed; i < nalsize; i++)
7522                 if (buf[buf_index+i])
7523                     debug_level = AV_LOG_ERROR;
7524             av_log(h->s.avctx, debug_level, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7525             consumed= nalsize;
7526         }
7527
7528         buf_index += consumed;
7529
7530         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7531            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7532             continue;
7533
7534       again:
7535         err = 0;
7536         switch(hx->nal_unit_type){
7537         case NAL_IDR_SLICE:
7538             if (h->nal_unit_type != NAL_IDR_SLICE) {
7539                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7540                 return -1;
7541             }
7542             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7543         case NAL_SLICE:
7544             init_get_bits(&hx->s.gb, ptr, bit_length);
7545             hx->intra_gb_ptr=
7546             hx->inter_gb_ptr= &hx->s.gb;
7547             hx->s.data_partitioning = 0;
7548
7549             if((err = decode_slice_header(hx, h)))
7550                break;
7551
7552             if (s->avctx->hwaccel && h->current_slice == 1) {
7553                 if (s->avctx->hwaccel->start_frame(s->avctx, NULL, 0) < 0)
7554                     return -1;
7555             }
7556
7557             s->current_picture_ptr->key_frame |=
7558                     (hx->nal_unit_type == NAL_IDR_SLICE) ||
7559                     (h->sei_recovery_frame_cnt >= 0);
7560             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7561                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7562                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7563                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7564                && avctx->skip_frame < AVDISCARD_ALL){
7565                 if(avctx->hwaccel) {
7566                     if (avctx->hwaccel->decode_slice(avctx, &buf[buf_index - consumed], consumed) < 0)
7567                         return -1;
7568                 }else
7569                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7570                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7571                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7572                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7573                 }else
7574                     context_count++;
7575             }
7576             break;
7577         case NAL_DPA:
7578             init_get_bits(&hx->s.gb, ptr, bit_length);
7579             hx->intra_gb_ptr=
7580             hx->inter_gb_ptr= NULL;
7581             hx->s.data_partitioning = 1;
7582
7583             err = decode_slice_header(hx, h);
7584             break;
7585         case NAL_DPB:
7586             init_get_bits(&hx->intra_gb, ptr, bit_length);
7587             hx->intra_gb_ptr= &hx->intra_gb;
7588             break;
7589         case NAL_DPC:
7590             init_get_bits(&hx->inter_gb, ptr, bit_length);
7591             hx->inter_gb_ptr= &hx->inter_gb;
7592
7593             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7594                && s->context_initialized
7595                && s->hurry_up < 5
7596                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7597                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7598                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7599                && avctx->skip_frame < AVDISCARD_ALL)
7600                 context_count++;
7601             break;
7602         case NAL_SEI:
7603             init_get_bits(&s->gb, ptr, bit_length);
7604             ff_h264_decode_sei(h);
7605             break;
7606         case NAL_SPS:
7607             init_get_bits(&s->gb, ptr, bit_length);
7608             ff_h264_decode_seq_parameter_set(h);
7609
7610             if(s->flags& CODEC_FLAG_LOW_DELAY)
7611                 s->low_delay=1;
7612
7613             if(avctx->has_b_frames < 2)
7614                 avctx->has_b_frames= !s->low_delay;
7615             break;
7616         case NAL_PPS:
7617             init_get_bits(&s->gb, ptr, bit_length);
7618
7619             ff_h264_decode_picture_parameter_set(h, bit_length);
7620
7621             break;
7622         case NAL_AUD:
7623         case NAL_END_SEQUENCE:
7624         case NAL_END_STREAM:
7625         case NAL_FILLER_DATA:
7626         case NAL_SPS_EXT:
7627         case NAL_AUXILIARY_SLICE:
7628             break;
7629         default:
7630             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7631         }
7632
7633         if(context_count == h->max_contexts) {
7634             execute_decode_slices(h, context_count);
7635             context_count = 0;
7636         }
7637
7638         if (err < 0)
7639             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7640         else if(err == 1) {
7641             /* Slice could not be decoded in parallel mode, copy down
7642              * NAL unit stuff to context 0 and restart. Note that
7643              * rbsp_buffer is not transferred, but since we no longer
7644              * run in parallel mode this should not be an issue. */
7645             h->nal_unit_type = hx->nal_unit_type;
7646             h->nal_ref_idc   = hx->nal_ref_idc;
7647             hx = h;
7648             goto again;
7649         }
7650     }
7651     if(context_count)
7652         execute_decode_slices(h, context_count);
7653     return buf_index;
7654 }
7655
7656 /**
7657  * returns the number of bytes consumed for building the current frame
7658  */
7659 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7660         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7661         if(pos+10>buf_size) pos=buf_size; // oops ;)
7662
7663         return pos;
7664 }
7665
7666 static int decode_frame(AVCodecContext *avctx,
7667                              void *data, int *data_size,
7668                              AVPacket *avpkt)
7669 {
7670     const uint8_t *buf = avpkt->data;
7671     int buf_size = avpkt->size;
7672     H264Context *h = avctx->priv_data;
7673     MpegEncContext *s = &h->s;
7674     AVFrame *pict = data;
7675     int buf_index;
7676
7677     s->flags= avctx->flags;
7678     s->flags2= avctx->flags2;
7679
7680    /* end of stream, output what is still in the buffers */
7681     if (buf_size == 0) {
7682         Picture *out;
7683         int i, out_idx;
7684
7685 //FIXME factorize this with the output code below
7686         out = h->delayed_pic[0];
7687         out_idx = 0;
7688         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7689             if(h->delayed_pic[i]->poc < out->poc){
7690                 out = h->delayed_pic[i];
7691                 out_idx = i;
7692             }
7693
7694         for(i=out_idx; h->delayed_pic[i]; i++)
7695             h->delayed_pic[i] = h->delayed_pic[i+1];
7696
7697         if(out){
7698             *data_size = sizeof(AVFrame);
7699             *pict= *(AVFrame*)out;
7700         }
7701
7702         return 0;
7703     }
7704
7705     if(h->is_avc && !h->got_avcC) {
7706         int i, cnt, nalsize;
7707         unsigned char *p = avctx->extradata;
7708         if(avctx->extradata_size < 7) {
7709             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7710             return -1;
7711         }
7712         if(*p != 1) {
7713             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7714             return -1;
7715         }
7716         /* sps and pps in the avcC always have length coded with 2 bytes,
7717            so put a fake nal_length_size = 2 while parsing them */
7718         h->nal_length_size = 2;
7719         // Decode sps from avcC
7720         cnt = *(p+5) & 0x1f; // Number of sps
7721         p += 6;
7722         for (i = 0; i < cnt; i++) {
7723             nalsize = AV_RB16(p) + 2;
7724             if(decode_nal_units(h, p, nalsize) < 0) {
7725                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7726                 return -1;
7727             }
7728             p += nalsize;
7729         }
7730         // Decode pps from avcC
7731         cnt = *(p++); // Number of pps
7732         for (i = 0; i < cnt; i++) {
7733             nalsize = AV_RB16(p) + 2;
7734             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7735                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7736                 return -1;
7737             }
7738             p += nalsize;
7739         }
7740         // Now store right nal length size, that will be use to parse all other nals
7741         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7742         // Do not reparse avcC
7743         h->got_avcC = 1;
7744     }
7745
7746     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7747         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7748             return -1;
7749         h->got_avcC = 1;
7750     }
7751
7752     buf_index=decode_nal_units(h, buf, buf_size);
7753     if(buf_index < 0)
7754         return -1;
7755
7756     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7757         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7758         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7759         return -1;
7760     }
7761
7762     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7763         Picture *out = s->current_picture_ptr;
7764         Picture *cur = s->current_picture_ptr;
7765         int i, pics, cross_idr, out_of_order, out_idx;
7766
7767         field_end(h);
7768
7769         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7770             /* Wait for second field. */
7771             *data_size = 0;
7772
7773         } else {
7774             cur->repeat_pict = 0;
7775
7776             /* Signal interlacing information externally. */
7777             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7778             if (h->sei_ct_type)
7779                 cur->interlaced_frame = (h->sei_ct_type & (1<<1)) != 0;
7780             else
7781                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7782
7783             if(h->sps.pic_struct_present_flag){
7784                 switch (h->sei_pic_struct)
7785                 {
7786                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7787                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7788                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7789                     // From these hints, let the applications decide if they apply deinterlacing.
7790                     cur->repeat_pict = 1;
7791                     break;
7792                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7793                     // Force progressive here, as doubling interlaced frame is a bad idea.
7794                     cur->interlaced_frame = 0;
7795                     cur->repeat_pict = 2;
7796                     break;
7797                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7798                     cur->interlaced_frame = 0;
7799                     cur->repeat_pict = 4;
7800                     break;
7801                 }
7802             }else{
7803                 /* Derive interlacing flag from used decoding process. */
7804                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7805             }
7806
7807             if (cur->field_poc[0] != cur->field_poc[1]){
7808                 /* Derive top_field_first from field pocs. */
7809                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7810             }else{
7811                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7812                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7813                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7814                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7815                         cur->top_field_first = 1;
7816                     else
7817                         cur->top_field_first = 0;
7818                 }else{
7819                     /* Most likely progressive */
7820                     cur->top_field_first = 0;
7821                 }
7822             }
7823
7824         //FIXME do something with unavailable reference frames
7825
7826             /* Sort B-frames into display order */
7827
7828             if(h->sps.bitstream_restriction_flag
7829                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7830                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7831                 s->low_delay = 0;
7832             }
7833
7834             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7835                && !h->sps.bitstream_restriction_flag){
7836                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7837                 s->low_delay= 0;
7838             }
7839
7840             pics = 0;
7841             while(h->delayed_pic[pics]) pics++;
7842
7843             assert(pics <= MAX_DELAYED_PIC_COUNT);
7844
7845             h->delayed_pic[pics++] = cur;
7846             if(cur->reference == 0)
7847                 cur->reference = DELAYED_PIC_REF;
7848
7849             out = h->delayed_pic[0];
7850             out_idx = 0;
7851             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7852                 if(h->delayed_pic[i]->poc < out->poc){
7853                     out = h->delayed_pic[i];
7854                     out_idx = i;
7855                 }
7856             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7857
7858             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7859
7860             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7861                 { }
7862             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7863                || (s->low_delay &&
7864                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7865                  || cur->pict_type == FF_B_TYPE)))
7866             {
7867                 s->low_delay = 0;
7868                 s->avctx->has_b_frames++;
7869             }
7870
7871             if(out_of_order || pics > s->avctx->has_b_frames){
7872                 out->reference &= ~DELAYED_PIC_REF;
7873                 for(i=out_idx; h->delayed_pic[i]; i++)
7874                     h->delayed_pic[i] = h->delayed_pic[i+1];
7875             }
7876             if(!out_of_order && pics > s->avctx->has_b_frames){
7877                 *data_size = sizeof(AVFrame);
7878
7879                 h->outputed_poc = out->poc;
7880                 *pict= *(AVFrame*)out;
7881             }else{
7882                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7883             }
7884         }
7885     }
7886
7887     assert(pict->data[0] || !*data_size);
7888     ff_print_debug_info(s, pict);
7889 //printf("out %d\n", (int)pict->data[0]);
7890 #if 0 //?
7891
7892     /* Return the Picture timestamp as the frame number */
7893     /* we subtract 1 because it is added on utils.c     */
7894     avctx->frame_number = s->picture_number - 1;
7895 #endif
7896     return get_consumed_bytes(s, buf_index, buf_size);
7897 }
#if 0
/**
 * Fills h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y).
 *
 * Entries 0, 1 and 2 refer to the macroblocks one row up at x-1, x and x+1,
 * entry 3 to the macroblock at x-1 in the same row. A neighbour is flagged
 * as available when its position passes the bounds test below and its
 * slice_table entry equals h->slice_num.
 * Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int cur_xy    = s->mb_x + s->mb_y*s->mb_stride;
    const int above_xy  = cur_xy - s->mb_stride;
    const int has_left  = s->mb_x != 0;
    const int has_right = s->mb_x+1 < s->mb_width;

    if(!s->mb_y){
        /* row 0: there is no row above, so nothing up there is available */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        /* the bounds test comes first so slice_table is never read out of the row */
        h->mb_avail[0]= has_left  && h->slice_table[above_xy - 1] == h->slice_num;
        h->mb_avail[1]=              h->slice_table[above_xy    ] == h->slice_num;
        h->mb_avail[2]= has_right && h->slice_table[above_xy + 1] == h->slice_num;
    }

    h->mb_avail[3]= has_left && h->slice_table[cur_xy - 1] == h->slice_num;

    //FIXME move out
    h->mb_avail[4]= 1;
    h->mb_avail[5]= 0;
}
#endif
7917
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self-test entry point, only compiled when TEST is defined.
 *
 * Writes COUNT unsigned exp-golomb codes (values 0..COUNT-1) and COUNT signed
 * exp-golomb codes (values -COUNT/2..COUNT/2-1) into a buffer, reads them back
 * and prints a line for every value that does not decode to what was written.
 * Each write/read call is wrapped in START_TIMER/STOP_TIMER.
 *
 * @return always 0; mismatches are reported on stdout only
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    /* avctx is an automatic variable; zero it so that dsputil_init() is not
     * handed indeterminate field values. */
    memset(&avctx, 0, sizeof(AVCodecContext));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j;
        unsigned int bits; /* upcoming 24 bits, printed with %X on mismatch */

        bits= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, bits);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j;
        unsigned int bits; /* upcoming 24 bits, printed with %X on mismatch */
        const int expected= i - COUNT/2; /* the value written at position i */

        bits= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != expected){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, expected, bits);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    /* NOTE(review): the section below is disabled. It uses identifiers that
     * are not declared anywhere in this function (s->dsp, qp, src1_block,
     * src2_block), so it presumably needs porting before it can be enabled. */
#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= ff_h264_decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    /* Only the banner is printed; no RBSP test follows it. */
    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
8092
8093
8094 av_cold void ff_h264_free_context(H264Context *h)
8095 {
8096     int i;
8097
8098     av_freep(&h->rbsp_buffer[0]);
8099     av_freep(&h->rbsp_buffer[1]);
8100     free_tables(h); //FIXME cleanup init stuff perhaps
8101
8102     for(i = 0; i < MAX_SPS_COUNT; i++)
8103         av_freep(h->sps_buffers + i);
8104
8105     for(i = 0; i < MAX_PPS_COUNT; i++)
8106         av_freep(h->pps_buffers + i);
8107 }
8108
8109 static av_cold int decode_end(AVCodecContext *avctx)
8110 {
8111     H264Context *h = avctx->priv_data;
8112     MpegEncContext *s = &h->s;
8113
8114     ff_h264_free_context(h);
8115
8116     MPV_common_end(s);
8117
8118 //    memset(h, 0, sizeof(H264Context));
8119
8120     return 0;
8121 }
8122
8123
8124 AVCodec h264_decoder = {
8125     "h264",
8126     CODEC_TYPE_VIDEO,
8127     CODEC_ID_H264,
8128     sizeof(H264Context),
8129     decode_init,
8130     NULL,
8131     decode_end,
8132     decode_frame,
8133     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8134     .flush= flush_dpb,
8135     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8136     .pix_fmts= ff_hwaccel_pixfmt_list_420,
8137 };
8138
#if CONFIG_H264_VDPAU_DECODER
/**
 * Codec descriptor for the "h264_vdpau" decoder. It shares its init, close,
 * decode and flush callbacks with h264_decoder and additionally advertises
 * CODEC_CAP_HWACCEL_VDPAU.
 */
AVCodec h264_vdpau_decoder = {
    .name           = "h264_vdpau",
    .type           = CODEC_TYPE_VIDEO,
    .id             = CODEC_ID_H264,
    .priv_data_size = sizeof(H264Context),
    .init           = decode_init,
    .encode         = NULL, /* decoder only */
    .close          = decode_end,
    .decode         = decode_frame,
    .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
    .flush          = flush_dpb,
    .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
};
#endif
8154
8155 #if CONFIG_SVQ3_DECODER
8156 #include "svq3.c"
8157 #endif