libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
   45 /**
   46  * Value of Picture.reference when Picture is not a reference picture, but
   47  * is held for delayed output.
   48  */
   49 #define DELAYED_PIC_REF 4
   50
      /*
       * File-local VLC objects. Each VLC object (or array of them) is followed
       * by a VLC_TYPE array and a size constant sharing the same name stem --
       * presumably the object's backing table and its number of entries; the
       * code that ties them together is not part of this section.
       */

      /* Four decoders; the first dimension of the table array equals the sum
       * of the four values in coeff_token_vlc_tables_size. */
   51 static VLC coeff_token_vlc[4];
   52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
   53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
   54
      /* Single decoder with a 256-entry table. */
   55 static VLC chroma_dc_coeff_token_vlc;
   56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
   57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
   58
      /* 15 decoders, one 512-entry table each. */
   59 static VLC total_zeros_vlc[15];
   60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
   61 static const int total_zeros_vlc_tables_size = 512;
   62
      /* 3 decoders, one 8-entry table each. */
   63 static VLC chroma_dc_total_zeros_vlc[3];
   64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
   65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
   66
      /* 6 decoders, one 8-entry table each. */
   67 static VLC run_vlc[6];
   68 static VLC_TYPE run_vlc_tables[6][8][2];
   69 static const int run_vlc_tables_size = 8;
   70
      /* Single decoder with a 96-entry table. */
   71 static VLC run7_vlc;
   72 static VLC_TYPE run7_vlc_table[96][2];
   73 static const int run7_vlc_table_size = 96;
   74
      /* Prototypes of file-local (static) functions; their definitions are not
       * part of this section. */
   75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
   76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
   77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
   79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
  104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
          /*
           * Loads neighbour information for the macroblock at h->mb_xy into the
           * per-macroblock caches of *h:
           *  - neighbour positions (h->top_mb_xy, h->left_mb_xy[])
           *  - intra sample availability masks and intra4x4_pred_mode_cache
           *    (only when !for_deblock and mb_type is intra)
           *  - non_zero_count_cache border entries
           *  - top_cbp / left_cbp (only when h->pps.cabac)
           *  - border entries of mv_cache / ref_cache, plus mvd_cache and
           *    direct_cache under CABAC (only for inter or direct mb_type)
           *  - neighbor_transform_size
           *
           * h            decoder context, read and updated in place
           * mb_type      type flags of the current macroblock
           * for_deblock  non-zero when called for the deblocking pass; the
           *              function may then return early (see below), skips the
           *              intra and topleft/topright work, and rewrites
           *              ref_cache from the stored ref_index for MBAFF.
           */
  105     MpegEncContext * const s = &h->s;
  106     const int mb_xy= h->mb_xy;
  107     int topleft_xy, top_xy, topright_xy, left_xy[2];
  108     int topleft_type, top_type, topright_type, left_type[2];
  109     const int * left_block;
  110     int topleft_partition= -1;
  111     int i;
  112
          /* Top neighbour: mb_stride shifted left by FIELD_PICTURE entries back
           * (i.e. two rows up if that macro evaluates to 1). */
  113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  114
  115     //FIXME deblocking could skip the intra and nnz parts.
          /* Early exit for deblocking when not FRAME_MBAFF and either
           * slice_num == 1 or the top neighbour has the same slice_table entry
           * as the current macroblock. Note that nothing has been written to
           * *h at this point. */
  116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  117         return;
  118
  119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  120      * stuff, I can't imagine that these complex rules are worth it. */
  121
          /* Default neighbour positions; adjusted below for FRAME_MBAFF. */
  122     topleft_xy = top_xy - 1;
  123     topright_xy= top_xy + 1;
  124     left_xy[1] = left_xy[0] = mb_xy-1;
  125     left_block = left_block_options[0];
  126     if(FRAME_MBAFF){
              /* pair_xy addresses the upper macroblock of the current vertical
               * pair (mb_y with its low bit cleared). The *_mb_frame_flag values
               * are 1 when the respective macroblock is not interlaced. */
  127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
  128         const int top_pair_xy      = pair_xy     - s->mb_stride;
  129         const int topleft_pair_xy  = top_pair_xy - 1;
  130         const int topright_pair_xy = top_pair_xy + 1;
  131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
  132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
  133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
  134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
  135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
  136         const int bottom = (s->mb_y & 1);
  137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
              /* Move the top neighbour one more row up when the current
               * macroblock is a field macroblock and either it is the bottom of
               * its pair or the pair above is also field coded. */
  138         if (bottom
  139                 ? !curr_mb_frame_flag // bottom macroblock
  140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
  141                 ) {
  142             top_xy -= s->mb_stride;
  143         }
              /* Same rule for the top-left neighbour, with one extra case: a
               * bottom frame macroblock next to a field-coded left pair takes
               * its top-left from one row further down. */
  144         if (bottom
  145                 ? !curr_mb_frame_flag // bottom macroblock
  146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
  147                 ) {
  148             topleft_xy -= s->mb_stride;
  149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
  150             topleft_xy += s->mb_stride;
  151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
  152             topleft_partition = 0;
  153         }
              /* Same rule for the top-right neighbour. */
  154         if (bottom
  155                 ? !curr_mb_frame_flag // bottom macroblock
  156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
  157                 ) {
  158             topright_xy -= s->mb_stride;
  159         }
              /* Left pair and current macroblock differ in frame/field coding:
               * both left entries start at the upper macroblock of the left
               * pair and a different left_block index set is selected. */
  160         if (left_mb_frame_flag != curr_mb_frame_flag) {
  161             left_xy[1] = left_xy[0] = pair_xy - 1;
  162             if (curr_mb_frame_flag) {
  163                 if (bottom) {
  164                     left_block = left_block_options[1];
  165                 } else {
  166                     left_block= left_block_options[2];
  167                 }
  168             } else {
                      /* Current is field coded: the second left entry refers to
                       * the lower macroblock of the left pair. */
  169                 left_xy[1] += s->mb_stride;
  170                 left_block = left_block_options[3];
  171             }
  172         }
  173     }
  174
          /* Publish the neighbour positions in the context. */
  175     h->top_mb_xy = top_xy;
  176     h->left_mb_xy[0] = left_xy[0];
  177     h->left_mb_xy[1] = left_xy[1];
  178     if(for_deblock){
              /* Deblocking: top-left/top-right are not used; top and left count
               * as present whenever their slice_table entry is below 0xFFFF
               * (no same-slice requirement here). */
  179         topleft_type = 0;
  180         topright_type = 0;
  181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
  182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
  183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
  184
  185         if(MB_MBAFF && !IS_INTRA(mb_type)){
  186             int list;
  187             for(list=0; list<h->list_count; list++){
  188                 //These values were changed for ease of performing MC, we need to change them back
  189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
  190                 //the MC code from changing ref_cache and rather use a temporary array.
  191                 if(USES_LIST(mb_type,list)){
                          /* Reload the four 8x8 reference indices from the picture's
                           * ref_index array; each index is replicated into two
                           * adjacent bytes and each packed word is stored to two
                           * cache rows. */
  192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
  193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
  194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
  195                     ref += h->b8_stride;
  196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
  197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
  198                 }
  199             }
  200         }
  201     }else{
              /* Decoding: a neighbour only counts (non-zero type) when its
               * slice_table entry equals the current slice_num. */
  202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
  203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
  204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
  205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
  206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
  207
  208     if(IS_INTRA(mb_type)){
              /* type_mask selects which neighbour types count as usable for intra
               * prediction: with constrained_intra_pred only the bits produced
               * by IS_INTRA(-1), otherwise every bit. The *_samples_available
               * fields are bit masks that start at "all available" and get
               * bits cleared below for each unusable neighbour. */
  209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
  210         h->topleft_samples_available=
  211         h->top_samples_available=
  212         h->left_samples_available= 0xFFFF;
  213         h->topright_samples_available= 0xEEEA;
  214
  215         if(!(top_type & type_mask)){
  216             h->topleft_samples_available= 0xB3FF;
  217             h->top_samples_available= 0x33FF;
  218             h->topright_samples_available= 0x26EA;
  219         }
              /* Left availability: when the interlacing of the current and the
               * left macroblock differs, the two left entries are handled
               * separately (current interlaced) or must both be usable
               * (current not interlaced). */
  220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
  221             if(IS_INTERLACED(mb_type)){
  222                 if(!(left_type[0] & type_mask)){
  223                     h->topleft_samples_available&= 0xDFFF;
  224                     h->left_samples_available&= 0x5FFF;
  225                 }
  226                 if(!(left_type[1] & type_mask)){
  227                     h->topleft_samples_available&= 0xFF5F;
  228                     h->left_samples_available&= 0xFF5F;
  229                 }
  230             }else{
                      /* left_typei: type of the macroblock one row below
                       * left_xy[0], subject to the same slice check. */
  231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
  232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
  233                 assert(left_xy[0] == left_xy[1]);
  234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
  235                     h->topleft_samples_available&= 0xDF5F;
  236                     h->left_samples_available&= 0x5F5F;
  237                 }
  238             }
  239         }else{
  240             if(!(left_type[0] & type_mask)){
  241                 h->topleft_samples_available&= 0xDF5F;
  242                 h->left_samples_available&= 0x5F5F;
  243             }
  244         }
  245
  246         if(!(topleft_type & type_mask))
  247             h->topleft_samples_available&= 0x7FFF;
  248
  249         if(!(topright_type & type_mask))
  250             h->topright_samples_available&= 0xFBFF;
  251
  252         if(IS_INTRA4x4(mb_type)){
                  /* Top row of the prediction-mode cache: copy the stored modes
                   * of an intra4x4 top neighbour; otherwise use -1 when the
                   * neighbour is unusable and 2 when it is usable but not
                   * intra4x4. */
  253             if(IS_INTRA4x4(top_type)){
  254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
  255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
  256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
  257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
  258             }else{
  259                 int pred;
  260                 if(!(top_type & type_mask))
  261                     pred= -1;
  262                 else{
  263                     pred= 2;
  264                 }
  265                 h->intra4x4_pred_mode_cache[4+8*0]=
  266                 h->intra4x4_pred_mode_cache[5+8*0]=
  267                 h->intra4x4_pred_mode_cache[6+8*0]=
  268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
  269             }
                  /* Left column, two rows per left entry, same -1 / 2 fallback. */
  270             for(i=0; i<2; i++){
  271                 if(IS_INTRA4x4(left_type[i])){
  272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
  273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
  274                 }else{
  275                     int pred;
  276                     if(!(left_type[i] & type_mask))
  277                         pred= -1;
  278                     else{
  279                         pred= 2;
  280                     }
  281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
  282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
  283                 }
  284             }
  285         }
  286     }
  287     }
  288
  289
  290 /*
  291 0 . T T. T T T T
  292 1 L . .L . . . .
  293 2 L . .L . . . .
  294 3 . T TL . . . .
  295 4 L . .L . . . .
  296 5 L . .. . . . .
  297 */
  298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
          /* Non-zero-count cache, top entries (the T positions in the diagram
           * above): copied from the top neighbour when present, otherwise set
           * to 0 for CABAC non-intra macroblocks and to 64 in all other cases. */
  299     if(top_type){
  300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
  301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
  302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
  303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
  304
  305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
  306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
  307
  308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
  309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
  310
  311     }else{
  312         h->non_zero_count_cache[4+8*0]=
  313         h->non_zero_count_cache[5+8*0]=
  314         h->non_zero_count_cache[6+8*0]=
  315         h->non_zero_count_cache[7+8*0]=
  316
  317         h->non_zero_count_cache[1+8*0]=
  318         h->non_zero_count_cache[2+8*0]=
  319
  320         h->non_zero_count_cache[1+8*3]=
  321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  322
  323     }
  324
          /* Non-zero-count cache, left entries (the L positions): four per left
           * entry, selected through left_block; same default as for the top. */
  325     for (i=0; i<2; i++) {
  326         if(left_type[i]){
  327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
  328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
  329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
  330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
  331         }else{
  332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
  333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
  334             h->non_zero_count_cache[0+8*1 +   8*i]=
  335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
  336         }
  337     }
  338
          /* CABAC only: coded-block-pattern of the top and left neighbours.
           * A missing neighbour yields 0x1C0 for intra and 0 for other
           * macroblocks. */
  339     if( h->pps.cabac ) {
  340         // top_cbp
  341         if(top_type) {
  342             h->top_cbp = h->cbp_table[top_xy];
  343         } else if(IS_INTRA(mb_type)) {
  344             h->top_cbp = 0x1C0;
  345         } else {
  346             h->top_cbp = 0;
  347         }
  348         // left_cbp
  349         if (left_type[0]) {
  350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
  351         } else if(IS_INTRA(mb_type)) {
  352             h->left_cbp = 0x1C0;
  353         } else {
  354             h->left_cbp = 0;
  355         }
              /* Bits 1 and 3 of left_cbp are taken from one bit of each left
               * entry's cbp; the bit position is derived from left_block. */
  356         if (left_type[0]) {
  357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
  358         }
  359         if (left_type[1]) {
  360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
  361         }
  362     }
  363
  364 #if 1
          /* Motion data: fill the border entries of mv_cache / ref_cache (and,
           * under CABAC, mvd_cache / direct_cache) from the neighbours. */
  365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
  366         int list;
  367         for(list=0; list<h->list_count; list++){
                  /* A list the macroblock does not use is skipped, unless the
                   * macroblock is direct or h->deblocking_filter is set. */
  368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
  369                 /*if(!h->mv_cache_clean[list]){
  370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
  371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
  372                     h->mv_cache_clean[list]= 1;
  373                 }*/
  374                 continue;
  375             }
  376             h->mv_cache_clean[list]= 0;
  377
                  /* Top row: the four vectors of the neighbour's last 4x4 row and
                   * the two reference indices of its lower 8x8 row. Without a
                   * usable list the vectors are zeroed and the references become
                   * LIST_NOT_USED (neighbour exists) or PART_NOT_AVAILABLE. */
  378             if(USES_LIST(top_type, list)){
  379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
  381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
  382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
  383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
  384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
  385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
  386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
  387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
  388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
  389             }else{
  390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
  391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
  392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
  393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
  394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
  395             }
  396
                  /* Left column: two rows per left entry, taken from the
                   * neighbour's last 4x4 column at the rows given by left_block. */
  397             for(i=0; i<2; i++){
  398                 int cache_idx = scan8[0] - 1 + i*2*8;
  399                 if(USES_LIST(left_type[i], list)){
  400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
  401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
  402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
  403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
  404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
  405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
  406                 }else{
  407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
  408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
  409                     h->ref_cache[list][cache_idx  ]=
  410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  411                 }
  412             }
  413
                  /* Top and left are all that is loaded for deblocking, and for
                   * direct macroblocks with direct_spatial_mv_pred unset outside
                   * FRAME_MBAFF. */
  414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
  415                 continue;
  416
                  /* Top-left corner. topleft_partition is -1 (all bits set, so the
                   * masked stride terms are kept) or 0 (terms dropped, selecting
                   * a row/8x8 block further up inside the neighbour). */
  417             if(USES_LIST(topleft_type, list)){
  418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
  419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
  420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  422             }else{
  423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
  424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  425             }
  426
                  /* Top-right corner: first entry of the neighbour's last 4x4 row. */
  427             if(USES_LIST(topright_type, list)){
  428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
  429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
  430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
  431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
  432             }else{
  433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
  434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
  435             }
  436
                  /* Skip and direct macroblocks need nothing further unless
                   * FRAME_MBAFF is active. */
  437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
  438                 continue;
  439
                  /* Mark five fixed cache positions as PART_NOT_AVAILABLE with
                   * zero vectors. */
  440             h->ref_cache[list][scan8[5 ]+1] =
  441             h->ref_cache[list][scan8[7 ]+1] =
  442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
  443             h->ref_cache[list][scan8[4 ]] =
  444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
  445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
  446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
  447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
  449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
  450
  451             if( h->pps.cabac ) {
  452                 /* XXX beurk, Load mvd */
                      /* Top and left entries of mvd_cache are loaded from
                       * h->mvd_table with the same addressing as the motion
                       * vectors above; zero when the neighbour does not use
                       * this list. */
  453                 if(USES_LIST(top_type, list)){
  454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
  455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
  456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
  457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
  458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
  459                 }else{
  460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
  461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
  462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
  463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
  464                 }
  465                 if(USES_LIST(left_type[0], list)){
  466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
  467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
  468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
  469                 }else{
  470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
  471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
  472                 }
  473                 if(USES_LIST(left_type[1], list)){
  474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
  475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
  476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
  477                 }else{
  478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
  479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
  480                 }
  481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
  482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
  483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
  484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
  485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
  486
  487                 if(h->slice_type_nos == FF_B_TYPE){
                          /* B slices: clear the 4x4 interior of direct_cache, then
                           * set the top and left entries to 1 for direct
                           * neighbours, to the stored direct_table value for 8x8
                           * neighbours and to 0 otherwise. */
  488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
  489
  490                     if(IS_DIRECT(top_type)){
  491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
  492                     }else if(IS_8X8(top_type)){
  493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
  494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
  495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
  496                     }else{
  497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
  498                     }
  499
  500                     if(IS_DIRECT(left_type[0]))
  501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
  502                     else if(IS_8X8(left_type[0]))
  503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
  504                     else
  505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
  506
  507                     if(IS_DIRECT(left_type[1]))
  508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
  509                     else if(IS_8X8(left_type[1]))
  510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
  511                     else
  512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
  513                 }
  514             }
  515
                  /* FRAME_MBAFF: convert cached neighbour data to the coding mode
                   * of the current macroblock. MAP_MVS applies MAP_F2F to each of
                   * the ten border entries. In a field macroblock, entries from
                   * non-interlaced neighbours get the reference index doubled and
                   * the vertical mv/mvd halved; otherwise entries from interlaced
                   * neighbours get the index halved and the vertical mv/mvd
                   * doubled. Entries with a negative reference are left alone. */
  516             if(FRAME_MBAFF){
  517 #define MAP_MVS\
  518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
  519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
  520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
  521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
  522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
  523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
  524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
  525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
  526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
  527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
  528                 if(MB_FIELD){
  529 #define MAP_F2F(idx, mb_type)\
  530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  531                         h->ref_cache[list][idx] <<= 1;\
  532                         h->mv_cache[list][idx][1] /= 2;\
  533                         h->mvd_cache[list][idx][1] /= 2;\
  534                     }
  535                     MAP_MVS
  536 #undef MAP_F2F
  537                 }else{
  538 #define MAP_F2F(idx, mb_type)\
  539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
  540                         h->ref_cache[list][idx] >>= 1;\
  541                         h->mv_cache[list][idx][1] <<= 1;\
  542                         h->mvd_cache[list][idx][1] <<= 1;\
  543                     }
  544                     MAP_MVS
  545 #undef MAP_F2F
  546                 }
  547             }
  548         }
  549     }
  550 #endif
  551
          /* Number (0..2) of the top and first-left neighbours that use the 8x8
           * transform. */
  552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
  553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
/**
 * Fetches the diagonal neighbour used as third candidate by the motion
 * vector prediction.
 * The top-right entry of the caches (i - 8 + part_width) is used when it is
 * available, otherwise the top-left entry (i - 8 - 1). With FRAME_MBAFF some
 * neighbour combinations are instead read directly from the current picture
 * and stored, with a rescaled vertical component, in the scratch cache slot
 * scan8[0]-2.
 * @param C receives a pointer to the selected motion vector
 * @param i cache position (scan8 based) of the block being predicted
 * @param list reference list
 * @param part_width width of the partition in cache columns
 * @return the reference index belonging to *C; this can also be
 *         PART_NOT_AVAILABLE or LIST_NOT_USED
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
    MpegEncContext *s = &h->s;

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        const uint32_t *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* clear the scratch slot and hand it out; the final if/else below
         * overrides *C again unless one of the SET_DIAG_MV cases returns first */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
                /* SET_DIAG_MV always returns from fetch_diagonal_mv():
                 * LIST_NOT_USED if the macroblock covering (X4,Y4) does not use
                 * this list, otherwise it copies that block's mv into the
                 * scratch slot (vertical component modified by MV_OP) and
                 * returns its reference index modified by REF_OP */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right missing for a block in the leftmost cache column ((i&7)==4)
         * while the left neighbour exists: take the vector from the left
         * macroblock if its field/frame coding differs from the current one */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
                SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* regular case: top-right if available, else top-left */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf(s->avctx, "topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; relies on the stride macro defined above.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterflies on block[], results stored in temp[] */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int sum_a = in[stride*0] + in[stride*4];
        const int diff_a= in[stride*0] - in[stride*4];
        const int diff_b= in[stride*1] - in[stride*5];
        const int sum_b = in[stride*1] + in[stride*5];

        out[0]= sum_a  + sum_b;
        out[1]= diff_a + diff_b;
        out[2]= diff_a - diff_b;
        out[3]= sum_a  - sum_b;
    }

    /* second pass: butterflies on temp[], halved and written to block[] */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int *in= temp + i;
        const int sum_a = in[4*0] + in[4*2];
        const int diff_a= in[4*0] - in[4*2];
        const int diff_b= in[4*1] - in[4*3];
        const int sum_b = in[4*1] + in[4*3];

        out[stride*0 ]= (sum_a  + sum_b )>>1;
        out[stride*2 ]= (diff_a + diff_b)>>1;
        out[stride*8 ]= (diff_a - diff_b)>>1;
        out[stride*10]= (sum_a  - sum_b )>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code).
 * The four values sit at block[0], block[16], block[32] and block[48].
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + stride;
    const int top_sum = row0[0] + row0[xStride];
    const int top_diff= row0[0] - row0[xStride];
    const int bot_sum = row1[0] + row1[xStride];
    const int bot_diff= row1[0] - row1[xStride];

    row0[0      ]= (top_sum  + bot_sum );
    row0[xStride]= (top_diff + bot_diff);
    row1[0      ]= (top_sum  - bot_sum );
    row1[xStride]= (top_diff - bot_diff);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
1590 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1591                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1592                            int src_x_offset, int src_y_offset,
1593                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1594     MpegEncContext * const s = &h->s;
1595     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1596     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1597     const int luma_xy= (mx&3) + ((my&3)<<2);
1598     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1599     uint8_t * src_cb, * src_cr;
1600     int extra_width= h->emu_edge_width;
1601     int extra_height= h->emu_edge_height;
1602     int emu=0;
1603     const int full_mx= mx>>2;
1604     const int full_my= my>>2;
1605     const int pic_width  = 16*s->mb_width;
1606     const int pic_height = 16*s->mb_height >> MB_FIELD;
1607
1608     if(mx&7) extra_width -= 3;
1609     if(my&7) extra_height -= 3;
1610
1611     if(   full_mx < 0-extra_width
1612        || full_my < 0-extra_height
1613        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1614        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1615         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1616             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1617         emu=1;
1618     }
1619
1620     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1621     if(!square){
1622         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1623     }
1624
1625     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1626
1627     if(MB_FIELD){
1628         // chroma offset when predicting from a field of opposite parity
1629         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1630         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1631     }
1632     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1633     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1634
1635     if(emu){
1636         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1637             src_cb= s->edge_emu_buffer;
1638     }
1639     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1640
1641     if(emu){
1642         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1643             src_cr= s->edge_emu_buffer;
1644     }
1645     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1646 }
1647
1648 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1649                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1650                            int x_offset, int y_offset,
1651                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1652                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1653                            int list0, int list1){
1654     MpegEncContext * const s = &h->s;
1655     qpel_mc_func *qpix_op=  qpix_put;
1656     h264_chroma_mc_func chroma_op= chroma_put;
1657
1658     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1659     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1660     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1661     x_offset += 8*s->mb_x;
1662     y_offset += 8*(s->mb_y >> MB_FIELD);
1663
1664     if(list0){
1665         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1666         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1667                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1668                            qpix_op, chroma_op);
1669
1670         qpix_op=  qpix_avg;
1671         chroma_op= chroma_avg;
1672     }
1673
1674     if(list1){
1675         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1676         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1677                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1678                            qpix_op, chroma_op);
1679     }
1680 }
1681
1682 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1683                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1684                            int x_offset, int y_offset,
1685                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1686                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1687                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1688                            int list0, int list1){
1689     MpegEncContext * const s = &h->s;
1690
1691     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1692     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1693     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1694     x_offset += 8*s->mb_x;
1695     y_offset += 8*(s->mb_y >> MB_FIELD);
1696
1697     if(list0 && list1){
1698         /* don't optimize for luma-only case, since B-frames usually
1699          * use implicit weights => chroma too. */
1700         uint8_t *tmp_cb = s->obmc_scratchpad;
1701         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1702         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1703         int refn0 = h->ref_cache[0][ scan8[n] ];
1704         int refn1 = h->ref_cache[1][ scan8[n] ];
1705
1706         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1707                     dest_y, dest_cb, dest_cr,
1708                     x_offset, y_offset, qpix_put, chroma_put);
1709         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1710                     tmp_y, tmp_cb, tmp_cr,
1711                     x_offset, y_offset, qpix_put, chroma_put);
1712
1713         if(h->use_weight == 2){
1714             int weight0 = h->implicit_weight[refn0][refn1];
1715             int weight1 = 64 - weight0;
1716             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1717             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1718             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1719         }else{
1720             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1721                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1722                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1723             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1724                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1725                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1726             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1727                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1728                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1729         }
1730     }else{
1731         int list = list1 ? 1 : 0;
1732         int refn = h->ref_cache[list][ scan8[n] ];
1733         Picture *ref= &h->ref_list[list][refn];
1734         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1735                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1736                     qpix_put, chroma_put);
1737
1738         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1739                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1740         if(h->use_weight_chroma){
1741             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1742                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1743             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1745         }
1746     }
1747 }
1748
1749 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1750                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1751                            int x_offset, int y_offset,
1752                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1753                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1754                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1755                            int list0, int list1){
1756     if((h->use_weight==2 && list0 && list1
1757         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1758        || h->use_weight==1)
1759         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1760                          x_offset, y_offset, qpix_put, chroma_put,
1761                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1762     else
1763         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1764                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1765 }
1766
1767 static inline void prefetch_motion(H264Context *h, int list){
1768     /* fetch pixels for estimated mv 4 macroblocks ahead
1769      * optimized for 64byte cache lines */
1770     MpegEncContext * const s = &h->s;
1771     const int refn = h->ref_cache[list][scan8[0]];
1772     if(refn >= 0){
1773         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1774         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1775         uint8_t **src= h->ref_list[list][refn].data;
1776         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1777         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1778         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1779         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1780     }
1781 }
1782
1783 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1784                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1785                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1786                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1787     MpegEncContext * const s = &h->s;
1788     const int mb_xy= h->mb_xy;
1789     const int mb_type= s->current_picture.mb_type[mb_xy];
1790
1791     assert(IS_INTER(mb_type));
1792
1793     prefetch_motion(h, 0);
1794
1795     if(IS_16X16(mb_type)){
1796         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1797                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1798                 &weight_op[0], &weight_avg[0],
1799                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1800     }else if(IS_16X8(mb_type)){
1801         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1802                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1803                 &weight_op[1], &weight_avg[1],
1804                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1805         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1806                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1807                 &weight_op[1], &weight_avg[1],
1808                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1809     }else if(IS_8X16(mb_type)){
1810         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1811                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1812                 &weight_op[2], &weight_avg[2],
1813                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1814         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1815                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1816                 &weight_op[2], &weight_avg[2],
1817                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1818     }else{
1819         int i;
1820
1821         assert(IS_8X8(mb_type));
1822
1823         for(i=0; i<4; i++){
1824             const int sub_mb_type= h->sub_mb_type[i];
1825             const int n= 4*i;
1826             int x_offset= (i&1)<<2;
1827             int y_offset= (i&2)<<1;
1828
1829             if(IS_SUB_8X8(sub_mb_type)){
1830                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1831                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1832                     &weight_op[3], &weight_avg[3],
1833                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1834             }else if(IS_SUB_8X4(sub_mb_type)){
1835                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1836                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1837                     &weight_op[4], &weight_avg[4],
1838                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1839                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1840                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1841                     &weight_op[4], &weight_avg[4],
1842                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1843             }else if(IS_SUB_4X8(sub_mb_type)){
1844                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1845                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1846                     &weight_op[5], &weight_avg[5],
1847                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1848                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1849                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1850                     &weight_op[5], &weight_avg[5],
1851                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1852             }else{
1853                 int j;
1854                 assert(IS_SUB_4X4(sub_mb_type));
1855                 for(j=0; j<4; j++){
1856                     int sub_x_offset= x_offset + 2*(j&1);
1857                     int sub_y_offset= y_offset +   (j&2);
1858                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1859                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1860                         &weight_op[6], &weight_avg[6],
1861                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1862                 }
1863             }
1864         }
1865     }
1866
1867     prefetch_motion(h, 1);
1868 }
1869
1870 static av_cold void decode_init_vlc(void){
1871     static int done = 0;
1872
1873     if (!done) {
1874         int i;
1875         int offset;
1876         done = 1;
1877
1878         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1879         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1880         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1881                  &chroma_dc_coeff_token_len [0], 1, 1,
1882                  &chroma_dc_coeff_token_bits[0], 1, 1,
1883                  INIT_VLC_USE_NEW_STATIC);
1884
1885         offset = 0;
1886         for(i=0; i<4; i++){
1887             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1888             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1889             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1890                      &coeff_token_len [i][0], 1, 1,
1891                      &coeff_token_bits[i][0], 1, 1,
1892                      INIT_VLC_USE_NEW_STATIC);
1893             offset += coeff_token_vlc_tables_size[i];
1894         }
1895         /*
1896          * This is a one time safety check to make sure that
1897          * the packed static coeff_token_vlc table sizes
1898          * were initialized correctly.
1899          */
1900         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1901
1902         for(i=0; i<3; i++){
1903             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1904             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1905             init_vlc(&chroma_dc_total_zeros_vlc[i],
1906                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1907                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1908                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1909                      INIT_VLC_USE_NEW_STATIC);
1910         }
1911         for(i=0; i<15; i++){
1912             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1913             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1914             init_vlc(&total_zeros_vlc[i],
1915                      TOTAL_ZEROS_VLC_BITS, 16,
1916                      &total_zeros_len [i][0], 1, 1,
1917                      &total_zeros_bits[i][0], 1, 1,
1918                      INIT_VLC_USE_NEW_STATIC);
1919         }
1920
1921         for(i=0; i<6; i++){
1922             run_vlc[i].table = run_vlc_tables[i];
1923             run_vlc[i].table_allocated = run_vlc_tables_size;
1924             init_vlc(&run_vlc[i],
1925                      RUN_VLC_BITS, 7,
1926                      &run_len [i][0], 1, 1,
1927                      &run_bits[i][0], 1, 1,
1928                      INIT_VLC_USE_NEW_STATIC);
1929         }
1930         run7_vlc.table = run7_vlc_table,
1931         run7_vlc.table_allocated = run7_vlc_table_size;
1932         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1933                  &run_len [6][0], 1, 1,
1934                  &run_bits[6][0], 1, 1,
1935                  INIT_VLC_USE_NEW_STATIC);
1936     }
1937 }
1938
1939 static void free_tables(H264Context *h){
1940     int i;
1941     H264Context *hx;
1942     av_freep(&h->intra4x4_pred_mode);
1943     av_freep(&h->chroma_pred_mode_table);
1944     av_freep(&h->cbp_table);
1945     av_freep(&h->mvd_table[0]);
1946     av_freep(&h->mvd_table[1]);
1947     av_freep(&h->direct_table);
1948     av_freep(&h->non_zero_count);
1949     av_freep(&h->slice_table_base);
1950     h->slice_table= NULL;
1951
1952     av_freep(&h->mb2b_xy);
1953     av_freep(&h->mb2b8_xy);
1954
1955     for(i = 0; i < h->s.avctx->thread_count; i++) {
1956         hx = h->thread_context[i];
1957         if(!hx) continue;
1958         av_freep(&hx->top_borders[1]);
1959         av_freep(&hx->top_borders[0]);
1960         av_freep(&hx->s.obmc_scratchpad);
1961     }
1962 }
1963
1964 static void init_dequant8_coeff_table(H264Context *h){
1965     int i,q,x;
1966     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1967     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1968     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1969
1970     for(i=0; i<2; i++ ){
1971         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1972             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1973             break;
1974         }
1975
1976         for(q=0; q<52; q++){
1977             int shift = div6[q];
1978             int idx = rem6[q];
1979             for(x=0; x<64; x++)
1980                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1981                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1982                     h->pps.scaling_matrix8[i][x]) << shift;
1983         }
1984     }
1985 }
1986
1987 static void init_dequant4_coeff_table(H264Context *h){
1988     int i,j,q,x;
1989     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1990     for(i=0; i<6; i++ ){
1991         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1992         for(j=0; j<i; j++){
1993             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1994                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1995                 break;
1996             }
1997         }
1998         if(j<i)
1999             continue;
2000
2001         for(q=0; q<52; q++){
2002             int shift = div6[q] + 2;
2003             int idx = rem6[q];
2004             for(x=0; x<16; x++)
2005                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2006                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2007                     h->pps.scaling_matrix4[i][x]) << shift;
2008         }
2009     }
2010 }
2011
2012 static void init_dequant_tables(H264Context *h){
2013     int i,x;
2014     init_dequant4_coeff_table(h);
2015     if(h->pps.transform_8x8_mode)
2016         init_dequant8_coeff_table(h);
2017     if(h->sps.transform_bypass){
2018         for(i=0; i<6; i++)
2019             for(x=0; x<16; x++)
2020                 h->dequant4_coeff[i][0][x] = 1<<6;
2021         if(h->pps.transform_8x8_mode)
2022             for(i=0; i<2; i++)
2023                 for(x=0; x<64; x++)
2024                     h->dequant8_coeff[i][0][x] = 1<<6;
2025     }
2026 }
2027
2028
2029 /**
2030  * allocates tables.
2031  * needs width/height
2032  */
2033 static int alloc_tables(H264Context *h){
2034     MpegEncContext * const s = &h->s;
2035     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2036     int x,y;
2037
2038     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2039
2040     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2041     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2042     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2043
2044     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2045     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2046     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2047     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2048
2049     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2050     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2051
2052     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2053     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2054     for(y=0; y<s->mb_height; y++){
2055         for(x=0; x<s->mb_width; x++){
2056             const int mb_xy= x + y*s->mb_stride;
2057             const int b_xy = 4*x + 4*y*h->b_stride;
2058             const int b8_xy= 2*x + 2*y*h->b8_stride;
2059
2060             h->mb2b_xy [mb_xy]= b_xy;
2061             h->mb2b8_xy[mb_xy]= b8_xy;
2062         }
2063     }
2064
2065     s->obmc_scratchpad = NULL;
2066
2067     if(!h->dequant4_coeff[0])
2068         init_dequant_tables(h);
2069
2070     return 0;
2071 fail:
2072     free_tables(h);
2073     return -1;
2074 }
2075
2076 /**
2077  * Mimic alloc_tables(), but for every context thread.
2078  */
2079 static void clone_tables(H264Context *dst, H264Context *src){
2080     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2081     dst->non_zero_count           = src->non_zero_count;
2082     dst->slice_table              = src->slice_table;
2083     dst->cbp_table                = src->cbp_table;
2084     dst->mb2b_xy                  = src->mb2b_xy;
2085     dst->mb2b8_xy                 = src->mb2b8_xy;
2086     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2087     dst->mvd_table[0]             = src->mvd_table[0];
2088     dst->mvd_table[1]             = src->mvd_table[1];
2089     dst->direct_table             = src->direct_table;
2090
2091     dst->s.obmc_scratchpad = NULL;
2092     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2093 }
2094
2095 /**
2096  * Init context
2097  * Allocate buffers which are not shared amongst multiple threads.
2098  */
2099 static int context_init(H264Context *h){
2100     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2101     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2102
2103     return 0;
2104 fail:
2105     return -1; // free_tables will clean up for us
2106 }
2107
2108 static av_cold void common_init(H264Context *h){
2109     MpegEncContext * const s = &h->s;
2110
2111     s->width = s->avctx->width;
2112     s->height = s->avctx->height;
2113     s->codec_id= s->avctx->codec->id;
2114
2115     ff_h264_pred_init(&h->hpc, s->codec_id);
2116
2117     h->dequant_coeff_pps= -1;
2118     s->unrestricted_mv=1;
2119     s->decode=1; //FIXME
2120
2121     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2122     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2123 }
2124
2125 static av_cold int decode_init(AVCodecContext *avctx){
2126     H264Context *h= avctx->priv_data;
2127     MpegEncContext * const s = &h->s;
2128
2129     MPV_decode_defaults(s);
2130
2131     s->avctx = avctx;
2132     common_init(h);
2133
2134     s->out_format = FMT_H264;
2135     s->workaround_bugs= avctx->workaround_bugs;
2136
2137     // set defaults
2138 //    s->decode_mb= ff_h263_decode_mb;
2139     s->quarter_sample = 1;
2140     s->low_delay= 1;
2141
2142     if(avctx->codec_id == CODEC_ID_SVQ3)
2143         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2144     else
2145         avctx->pix_fmt= PIX_FMT_YUV420P;
2146
2147     decode_init_vlc();
2148
2149     if(avctx->extradata_size > 0 && avctx->extradata &&
2150        *(char *)avctx->extradata == 1){
2151         h->is_avc = 1;
2152         h->got_avcC = 0;
2153     } else {
2154         h->is_avc = 0;
2155     }
2156
2157     h->thread_context[0] = h;
2158     h->outputed_poc = INT_MIN;
2159     h->prev_poc_msb= 1<<16;
2160     return 0;
2161 }
2162
2163 static int frame_start(H264Context *h){
2164     MpegEncContext * const s = &h->s;
2165     int i;
2166
2167     if(MPV_frame_start(s, s->avctx) < 0)
2168         return -1;
2169     ff_er_frame_start(s);
2170     /*
2171      * MPV_frame_start uses pict_type to derive key_frame.
2172      * This is incorrect for H.264; IDR markings must be used.
2173      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2174      * See decode_nal_units().
2175      */
2176     s->current_picture_ptr->key_frame= 0;
2177
2178     assert(s->linesize && s->uvlinesize);
2179
2180     for(i=0; i<16; i++){
2181         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2182         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2183     }
2184     for(i=0; i<4; i++){
2185         h->block_offset[16+i]=
2186         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2187         h->block_offset[24+16+i]=
2188         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2189     }
2190
2191     /* can't be in alloc_tables because linesize isn't known there.
2192      * FIXME: redo bipred weight to not require extra buffer? */
2193     for(i = 0; i < s->avctx->thread_count; i++)
2194         if(!h->thread_context[i]->s.obmc_scratchpad)
2195             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2196
2197     /* some macroblocks will be accessed before they're available */
2198     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2199         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2200
2201 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2202
2203     // We mark the current picture as non-reference after allocating it, so
2204     // that if we break out due to an error it can be released automatically
2205     // in the next MPV_frame_start().
2206     // SVQ3 as well as most other codecs have only last/next/current and thus
2207     // get released even with set reference, besides SVQ3 and others do not
2208     // mark frames as reference later "naturally".
2209     if(s->codec_id != CODEC_ID_SVQ3)
2210         s->current_picture_ptr->reference= 0;
2211
2212     s->current_picture_ptr->field_poc[0]=
2213     s->current_picture_ptr->field_poc[1]= INT_MAX;
2214     assert(s->current_picture_ptr->long_ref==0);
2215
2216     return 0;
2217 }
2218
2219 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2220     MpegEncContext * const s = &h->s;
2221     int i;
2222     int step    = 1;
2223     int offset  = 1;
2224     int uvoffset= 1;
2225     int top_idx = 1;
2226     int skiplast= 0;
2227
2228     src_y  -=   linesize;
2229     src_cb -= uvlinesize;
2230     src_cr -= uvlinesize;
2231
2232     if(!simple && FRAME_MBAFF){
2233         if(s->mb_y&1){
2234             offset  = MB_MBAFF ? 1 : 17;
2235             uvoffset= MB_MBAFF ? 1 : 9;
2236             if(!MB_MBAFF){
2237                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2238                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2239                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2240                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2241                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2242                 }
2243             }
2244         }else{
2245             if(!MB_MBAFF){
2246                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2247                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2248                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2249                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2250                 }
2251                 skiplast= 1;
2252             }
2253             offset  =
2254             uvoffset=
2255             top_idx = MB_MBAFF ? 0 : 1;
2256         }
2257         step= MB_MBAFF ? 2 : 1;
2258     }
2259
2260     // There are two lines saved, the line above the the top macroblock of a pair,
2261     // and the line above the bottom macroblock
2262     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2263     for(i=1; i<17 - skiplast; i++){
2264         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2265     }
2266
2267     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2268     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2269
2270     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2271         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2272         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2273         for(i=1; i<9 - skiplast; i++){
2274             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2275             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2276         }
2277         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2278         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2279     }
2280 }
2281
2282 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2283     MpegEncContext * const s = &h->s;
2284     int temp8, i;
2285     uint64_t temp64;
2286     int deblock_left;
2287     int deblock_top;
2288     int mb_xy;
2289     int step    = 1;
2290     int offset  = 1;
2291     int uvoffset= 1;
2292     int top_idx = 1;
2293
2294     if(!simple && FRAME_MBAFF){
2295         if(s->mb_y&1){
2296             offset  = MB_MBAFF ? 1 : 17;
2297             uvoffset= MB_MBAFF ? 1 : 9;
2298         }else{
2299             offset  =
2300             uvoffset=
2301             top_idx = MB_MBAFF ? 0 : 1;
2302         }
2303         step= MB_MBAFF ? 2 : 1;
2304     }
2305
2306     if(h->deblocking_filter == 2) {
2307         mb_xy = h->mb_xy;
2308         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2309         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2310     } else {
2311         deblock_left = (s->mb_x > 0);
2312         deblock_top =  (s->mb_y > !!MB_FIELD);
2313     }
2314
2315     src_y  -=   linesize + 1;
2316     src_cb -= uvlinesize + 1;
2317     src_cr -= uvlinesize + 1;
2318
2319 #define XCHG(a,b,t,xchg)\
2320 t= a;\
2321 if(xchg)\
2322     a= b;\
2323 b= t;
2324
2325     if(deblock_left){
2326         for(i = !deblock_top; i<16; i++){
2327             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2328         }
2329         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2330     }
2331
2332     if(deblock_top){
2333         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2334         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2335         if(s->mb_x+1 < s->mb_width){
2336             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2337         }
2338     }
2339
2340     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2341         if(deblock_left){
2342             for(i = !deblock_top; i<8; i++){
2343                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2344                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2345             }
2346             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2347             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2348         }
2349         if(deblock_top){
2350             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2351             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2352         }
2353     }
2354 }
2355
/**
 * Reconstruct the current macroblock into s->current_picture and, if
 * h->deblocking_filter is set, run the deblocking filter on it.
 *
 * @param h      decoder context; the macroblock is selected by s->mb_x,
 *               s->mb_y and h->mb_xy
 * @param simple when non-zero the function skips the field/MBAFF handling,
 *               the intra PCM path and transform bypass, treats the codec
 *               as H.264 and always processes chroma. The callers visible
 *               here pass the constants 1 and 0.
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
    const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    /* top-left sample of this macroblock in each plane
     * (16 luma samples per macroblock, 8 for each chroma plane) */
    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        /* field macroblock: doubled line stride and the second set of
         * block offsets (entries 24 and up) */
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            /* odd macroblock row: move the destination up by 15 luma / 7 chroma lines */
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            /* rewrite the cached reference indices of every used list as
             * (16+ref)^(mb_y&1); in the partitioned case negative entries
             * are left untouched */
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        /* PCM macroblock: the samples are copied straight out of h->mb,
         * 16 bytes per luma row and 8 bytes per chroma row */
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            /* with deblocking enabled, swap in the saved border samples
             * before predicting (xchg=1); they are swapped back after the
             * intra prediction below (xchg=0) */
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            /* chroma intra prediction */
            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        /* four 8x8 blocks: predict each one, then add its residual */
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels8;
                        }else{
                            idct_dc_add = s->dsp.h264_idct8_dc_add;
                            idct_add    = s->dsp.h264_idct8_add;
                        }
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                /* bypass with profile_idc 244 and dir 0 or 1:
                                 * combined predict-and-add function */
                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
                                    /* a single coefficient sitting in the first
                                     * position uses the DC-only add */
                                    if(nnz == 1 && h->mb[i*16])
                                        idct_dc_add(ptr, h->mb + i*16, linesize);
                                    else
                                        idct_add   (ptr, h->mb + i*16, linesize);
                                }
                            }
                        }
                    }else{
                        /* sixteen 4x4 blocks: predict each one, then add its residual */
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels4;
                        }else{
                            idct_dc_add = s->dsp.h264_idct_dc_add;
                            idct_add    = s->dsp.h264_idct_add;
                        }
                        for(i=0; i<16; i++){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
                                /* only these two modes get a top-right pointer */
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
                                        /* not available: replicate the sample above
                                         * the block's last column into all four bytes */
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
                                    }else
                                        topright= ptr + 4 - linesize;
                                }else
                                    topright= NULL;

                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
                                        if(nnz == 1 && h->mb[i*16])
                                            idct_dc_add(ptr, h->mb + i*16, linesize);
                                        else
                                            idct_add   (ptr, h->mb + i*16, linesize);
                                    }else
                                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                                }
                            }
                        }
                    }
                }
            }else{
                /* other intra macroblocks: one 16x16 prediction, then the
                 * luma DC transform; the AC residual is added further down */
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            /* non-intra macroblock: motion compensation */
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        /* luma residual for everything except intra 4x4, whose residual
         * was already added block by block above */
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    if(transform_bypass){
                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                    }
                }else if(h->cbp&15){
                    /* the low four cbp bits gate the luma residual */
                    if(transform_bypass){
                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                        if(IS_8x8DCT(mb_type)){
                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }else{
                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }
                    }
                }
            }else{
                /* codec other than H.264: SVQ3 inverse transform per 4x4 block */
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        /* chroma residual, gated by cbp bits 0x30; blocks 16..19 go to
         * dest_cb and 20..23 to dest_cr (selected by (i&4)>>2) */
        if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }
            }else{
                /* chroma DC transform first; the dequant list index is
                 * 1/2 for intra and 4/5 for other macroblocks */
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                if(is_h264){
                    idct_add = s->dsp.h264_idct_add;
                    idct_dc_add = s->dsp.h264_idct_dc_add;
                    for(i=16; i<16+8; i++){
                        /* full add when the cache reports coefficients,
                         * DC-only add when just the first coefficient is set */
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }else{
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                            svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                        }
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        /* save the unfiltered border samples, refill the caches and
         * recompute both chroma qps from the stored luma qscale before
         * filtering; MBAFF frames use filter_mb(), everything else
         * filter_mb_fast() */
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2606
2607 /**
2608  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2609  */
2610 static void hl_decode_mb_simple(H264Context *h){
2611     hl_decode_mb_internal(h, 1);
2612 }
2613
2614 /**
2615  * Process a macroblock; this handles edge cases, such as interlacing.
2616  */
2617 static void av_noinline hl_decode_mb_complex(H264Context *h){
2618     hl_decode_mb_internal(h, 0);
2619 }
2620
2621 static void hl_decode_mb(H264Context *h){
2622     MpegEncContext * const s = &h->s;
2623     const int mb_xy= h->mb_xy;
2624     const int mb_type= s->current_picture.mb_type[mb_xy];
2625     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2626
2627     if(ENABLE_H264_ENCODER && !s->decode)
2628         return;
2629
2630     if (is_complex)
2631         hl_decode_mb_complex(h);
2632     else hl_decode_mb_simple(h);
2633 }
2634
2635 static void pic_as_field(Picture *pic, const int parity){
2636     int i;
2637     for (i = 0; i < 4; ++i) {
2638         if (parity == PICT_BOTTOM_FIELD)
2639             pic->data[i] += pic->linesize[i];
2640         pic->reference = parity;
2641         pic->linesize[i] *= 2;
2642     }
2643     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2644 }
2645
2646 static int split_field_copy(Picture *dest, Picture *src,
2647                             int parity, int id_add){
2648     int match = !!(src->reference & parity);
2649
2650     if (match) {
2651         *dest = *src;
2652         if(parity != PICT_FRAME){
2653             pic_as_field(dest, parity);
2654             dest->pic_id *= 2;
2655             dest->pic_id += id_add;
2656         }
2657     }
2658
2659     return match;
2660 }
2661
2662 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2663     int i[2]={0};
2664     int index=0;
2665
2666     while(i[0]<len || i[1]<len){
2667         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2668             i[0]++;
2669         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2670             i[1]++;
2671         if(i[0] < len){
2672             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2673             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2674         }
2675         if(i[1] < len){
2676             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2677             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2678         }
2679     }
2680
2681     return index;
2682 }
2683
2684 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2685     int i, best_poc;
2686     int out_i= 0;
2687
2688     for(;;){
2689         best_poc= dir ? INT_MIN : INT_MAX;
2690
2691         for(i=0; i<len; i++){
2692             const int poc= src[i]->poc;
2693             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2694                 best_poc= poc;
2695                 sorted[out_i]= src[i];
2696             }
2697         }
2698         if(best_poc == (dir ? INT_MIN : INT_MAX))
2699             break;
2700         limit= sorted[out_i++]->poc - dir;
2701     }
2702     return out_i;
2703 }
2704
2705 /**
2706  * fills the default_ref_list.
2707  */
2708 static int fill_default_ref_list(H264Context *h){
2709     MpegEncContext * const s = &h->s;
2710     int i, len;
2711
2712     if(h->slice_type_nos==FF_B_TYPE){
2713         Picture *sorted[32];
2714         int cur_poc, list;
2715         int lens[2];
2716
2717         if(FIELD_PICTURE)
2718             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2719         else
2720             cur_poc= s->current_picture_ptr->poc;
2721
2722         for(list= 0; list<2; list++){
2723             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2724             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2725             assert(len<=32);
2726             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2727             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2728             assert(len<=32);
2729
2730             if(len < h->ref_count[list])
2731                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2732             lens[list]= len;
2733         }
2734
2735         if(lens[0] == lens[1] && lens[1] > 1){
2736             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2737             if(i == lens[0])
2738                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2739         }
2740     }else{
2741         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2742         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2743         assert(len <= 32);
2744         if(len < h->ref_count[0])
2745             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2746     }
2747 #ifdef TRACE
2748     for (i=0; i<h->ref_count[0]; i++) {
2749         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2750     }
2751     if(h->slice_type_nos==FF_B_TYPE){
2752         for (i=0; i<h->ref_count[1]; i++) {
2753             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2754         }
2755     }
2756 #endif
2757     return 0;
2758 }
2759
2760 static void print_short_term(H264Context *h);
2761 static void print_long_term(H264Context *h);
2762
2763 /**
2764  * Extract structure information about the picture described by pic_num in
2765  * the current decoding context (frame or field). Note that pic_num is
2766  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2767  * @param pic_num picture number for which to extract structure information
2768  * @param structure one of PICT_XXX describing structure of picture
2769  *                      with pic_num
2770  * @return frame number (short term) or long term index of picture
2771  *         described by pic_num
2772  */
2773 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2774     MpegEncContext * const s = &h->s;
2775
2776     *structure = s->picture_structure;
2777     if(FIELD_PICTURE){
2778         if (!(pic_num & 1))
2779             /* opposite field */
2780             *structure ^= PICT_FRAME;
2781         pic_num >>= 1;
2782     }
2783
2784     return pic_num;
2785 }
2786
2787 static int decode_ref_pic_list_reordering(H264Context *h){
2788     MpegEncContext * const s = &h->s;
2789     int list, index, pic_structure;
2790
2791     print_short_term(h);
2792     print_long_term(h);
2793
2794     for(list=0; list<h->list_count; list++){
2795         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2796
2797         if(get_bits1(&s->gb)){
2798             int pred= h->curr_pic_num;
2799
2800             for(index=0; ; index++){
2801                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2802                 unsigned int pic_id;
2803                 int i;
2804                 Picture *ref = NULL;
2805
2806                 if(reordering_of_pic_nums_idc==3)
2807                     break;
2808
2809                 if(index >= h->ref_count[list]){
2810                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2811                     return -1;
2812                 }
2813
2814                 if(reordering_of_pic_nums_idc<3){
2815                     if(reordering_of_pic_nums_idc<2){
2816                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2817                         int frame_num;
2818
2819                         if(abs_diff_pic_num > h->max_pic_num){
2820                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2821                             return -1;
2822                         }
2823
2824                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2825                         else                                pred+= abs_diff_pic_num;
2826                         pred &= h->max_pic_num - 1;
2827
2828                         frame_num = pic_num_extract(h, pred, &pic_structure);
2829
2830                         for(i= h->short_ref_count-1; i>=0; i--){
2831                             ref = h->short_ref[i];
2832                             assert(ref->reference);
2833                             assert(!ref->long_ref);
2834                             if(
2835                                    ref->frame_num == frame_num &&
2836                                    (ref->reference & pic_structure)
2837                               )
2838                                 break;
2839                         }
2840                         if(i>=0)
2841                             ref->pic_id= pred;
2842                     }else{
2843                         int long_idx;
2844                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2845
2846                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2847
2848                         if(long_idx>31){
2849                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2850                             return -1;
2851                         }
2852                         ref = h->long_ref[long_idx];
2853                         assert(!(ref && !ref->reference));
2854                         if(ref && (ref->reference & pic_structure)){
2855                             ref->pic_id= pic_id;
2856                             assert(ref->long_ref);
2857                             i=0;
2858                         }else{
2859                             i=-1;
2860                         }
2861                     }
2862
2863                     if (i < 0) {
2864                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2865                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2866                     } else {
2867                         for(i=index; i+1<h->ref_count[list]; i++){
2868                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2869                                 break;
2870                         }
2871                         for(; i > index; i--){
2872                             h->ref_list[list][i]= h->ref_list[list][i-1];
2873                         }
2874                         h->ref_list[list][index]= *ref;
2875                         if (FIELD_PICTURE){
2876                             pic_as_field(&h->ref_list[list][index], pic_structure);
2877                         }
2878                     }
2879                 }else{
2880                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2881                     return -1;
2882                 }
2883             }
2884         }
2885     }
2886     for(list=0; list<h->list_count; list++){
2887         for(index= 0; index < h->ref_count[list]; index++){
2888             if(!h->ref_list[list][index].data[0]){
2889                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2890                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2891             }
2892         }
2893     }
2894
2895     return 0;
2896 }
2897
2898 static void fill_mbaff_ref_list(H264Context *h){
2899     int list, i, j;
2900     for(list=0; list<2; list++){ //FIXME try list_count
2901         for(i=0; i<h->ref_count[list]; i++){
2902             Picture *frame = &h->ref_list[list][i];
2903             Picture *field = &h->ref_list[list][16+2*i];
2904             field[0] = *frame;
2905             for(j=0; j<3; j++)
2906                 field[0].linesize[j] <<= 1;
2907             field[0].reference = PICT_TOP_FIELD;
2908             field[0].poc= field[0].field_poc[0];
2909             field[1] = field[0];
2910             for(j=0; j<3; j++)
2911                 field[1].data[j] += frame->linesize[j];
2912             field[1].reference = PICT_BOTTOM_FIELD;
2913             field[1].poc= field[1].field_poc[1];
2914
2915             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2916             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2917             for(j=0; j<2; j++){
2918                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2919                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2920             }
2921         }
2922     }
2923     for(j=0; j<h->ref_count[1]; j++){
2924         for(i=0; i<h->ref_count[0]; i++)
2925             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2926         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2927         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2928     }
2929 }
2930
2931 static int pred_weight_table(H264Context *h){
2932     MpegEncContext * const s = &h->s;
2933     int list, i;
2934     int luma_def, chroma_def;
2935
2936     h->use_weight= 0;
2937     h->use_weight_chroma= 0;
2938     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2939     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2940     luma_def = 1<<h->luma_log2_weight_denom;
2941     chroma_def = 1<<h->chroma_log2_weight_denom;
2942
2943     for(list=0; list<2; list++){
2944         for(i=0; i<h->ref_count[list]; i++){
2945             int luma_weight_flag, chroma_weight_flag;
2946
2947             luma_weight_flag= get_bits1(&s->gb);
2948             if(luma_weight_flag){
2949                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2950                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2951                 if(   h->luma_weight[list][i] != luma_def
2952                    || h->luma_offset[list][i] != 0)
2953                     h->use_weight= 1;
2954             }else{
2955                 h->luma_weight[list][i]= luma_def;
2956                 h->luma_offset[list][i]= 0;
2957             }
2958
2959             if(CHROMA){
2960                 chroma_weight_flag= get_bits1(&s->gb);
2961                 if(chroma_weight_flag){
2962                     int j;
2963                     for(j=0; j<2; j++){
2964                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2965                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2966                         if(   h->chroma_weight[list][i][j] != chroma_def
2967                         || h->chroma_offset[list][i][j] != 0)
2968                             h->use_weight_chroma= 1;
2969                     }
2970                 }else{
2971                     int j;
2972                     for(j=0; j<2; j++){
2973                         h->chroma_weight[list][i][j]= chroma_def;
2974                         h->chroma_offset[list][i][j]= 0;
2975                     }
2976                 }
2977             }
2978         }
2979         if(h->slice_type_nos != FF_B_TYPE) break;
2980     }
2981     h->use_weight= h->use_weight || h->use_weight_chroma;
2982     return 0;
2983 }
2984
2985 static void implicit_weight_table(H264Context *h){
2986     MpegEncContext * const s = &h->s;
2987     int ref0, ref1;
2988     int cur_poc = s->current_picture_ptr->poc;
2989
2990     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2991        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2992         h->use_weight= 0;
2993         h->use_weight_chroma= 0;
2994         return;
2995     }
2996
2997     h->use_weight= 2;
2998     h->use_weight_chroma= 2;
2999     h->luma_log2_weight_denom= 5;
3000     h->chroma_log2_weight_denom= 5;
3001
3002     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3003         int poc0 = h->ref_list[0][ref0].poc;
3004         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3005             int poc1 = h->ref_list[1][ref1].poc;
3006             int td = av_clip(poc1 - poc0, -128, 127);
3007             if(td){
3008                 int tb = av_clip(cur_poc - poc0, -128, 127);
3009                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3010                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3011                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3012                     h->implicit_weight[ref0][ref1] = 32;
3013                 else
3014                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3015             }else
3016                 h->implicit_weight[ref0][ref1] = 32;
3017         }
3018     }
3019 }
3020
3021 /**
3022  * Mark a picture as no longer needed for reference. The refmask
3023  * argument allows unreferencing of individual fields or the whole frame.
3024  * If the picture becomes entirely unreferenced, but is being held for
3025  * display purposes, it is marked as such.
3026  * @param refmask mask of fields to unreference; the mask is bitwise
3027  *                anded with the reference marking of pic
3028  * @return non-zero if pic becomes entirely unreferenced (except possibly
3029  *         for display purposes) zero if one of the fields remains in
3030  *         reference
3031  */
3032 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3033     int i;
3034     if (pic->reference &= refmask) {
3035         return 0;
3036     } else {
3037         for(i = 0; h->delayed_pic[i]; i++)
3038             if(pic == h->delayed_pic[i]){
3039                 pic->reference=DELAYED_PIC_REF;
3040                 break;
3041             }
3042         return 1;
3043     }
3044 }
3045
3046 /**
3047  * instantaneous decoder refresh.
3048  */
3049 static void idr(H264Context *h){
3050     int i;
3051
3052     for(i=0; i<16; i++){
3053         remove_long(h, i, 0);
3054     }
3055     assert(h->long_ref_count==0);
3056
3057     for(i=0; i<h->short_ref_count; i++){
3058         unreference_pic(h, h->short_ref[i], 0);
3059         h->short_ref[i]= NULL;
3060     }
3061     h->short_ref_count=0;
3062     h->prev_frame_num= 0;
3063     h->prev_frame_num_offset= 0;
3064     h->prev_poc_msb=
3065     h->prev_poc_lsb= 0;
3066 }
3067
3068 /* forget old pics after a seek */
3069 static void flush_dpb(AVCodecContext *avctx){
3070     H264Context *h= avctx->priv_data;
3071     int i;
3072     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3073         if(h->delayed_pic[i])
3074             h->delayed_pic[i]->reference= 0;
3075         h->delayed_pic[i]= NULL;
3076     }
3077     h->outputed_poc= INT_MIN;
3078     idr(h);
3079     if(h->s.current_picture_ptr)
3080         h->s.current_picture_ptr->reference= 0;
3081     h->s.first_field= 0;
3082     ff_mpeg_flush(avctx);
3083 }
3084
3085 /**
3086  * Find a Picture in the short term reference list by frame number.
3087  * @param frame_num frame number to search for
3088  * @param idx the index into h->short_ref where returned picture is found
3089  *            undefined if no picture found.
3090  * @return pointer to the found picture, or NULL if no pic with the provided
3091  *                 frame number is found
3092  */
3093 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3094     MpegEncContext * const s = &h->s;
3095     int i;
3096
3097     for(i=0; i<h->short_ref_count; i++){
3098         Picture *pic= h->short_ref[i];
3099         if(s->avctx->debug&FF_DEBUG_MMCO)
3100             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3101         if(pic->frame_num == frame_num) {
3102             *idx = i;
3103             return pic;
3104         }
3105     }
3106     return NULL;
3107 }
3108
3109 /**
3110  * Remove a picture from the short term reference list by its index in
3111  * that list.  This does no checking on the provided index; it is assumed
3112  * to be valid. Other list entries are shifted down.
3113  * @param i index into h->short_ref of picture to remove.
3114  */
3115 static void remove_short_at_index(H264Context *h, int i){
3116     assert(i >= 0 && i < h->short_ref_count);
3117     h->short_ref[i]= NULL;
3118     if (--h->short_ref_count)
3119         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3120 }
3121
3122 /**
3123  *
3124  * @return the removed picture or NULL if an error occurs
3125  */
3126 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3127     MpegEncContext * const s = &h->s;
3128     Picture *pic;
3129     int i;
3130
3131     if(s->avctx->debug&FF_DEBUG_MMCO)
3132         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3133
3134     pic = find_short(h, frame_num, &i);
3135     if (pic){
3136         if(unreference_pic(h, pic, ref_mask))
3137         remove_short_at_index(h, i);
3138     }
3139
3140     return pic;
3141 }
3142
3143 /**
3144  * Remove a picture from the long term reference list by its index in
3145  * that list.
3146  * @return the removed picture or NULL if an error occurs
3147  */
3148 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3149     Picture *pic;
3150
3151     pic= h->long_ref[i];
3152     if (pic){
3153         if(unreference_pic(h, pic, ref_mask)){
3154             assert(h->long_ref[i]->long_ref == 1);
3155             h->long_ref[i]->long_ref= 0;
3156             h->long_ref[i]= NULL;
3157             h->long_ref_count--;
3158         }
3159     }
3160
3161     return pic;
3162 }
3163
3164 /**
3165  * print short term list
3166  */
3167 static void print_short_term(H264Context *h) {
3168     uint32_t i;
3169     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3170         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3171         for(i=0; i<h->short_ref_count; i++){
3172             Picture *pic= h->short_ref[i];
3173             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3174         }
3175     }
3176 }
3177
3178 /**
3179  * print long term list
3180  */
3181 static void print_long_term(H264Context *h) {
3182     uint32_t i;
3183     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3184         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3185         for(i = 0; i < 16; i++){
3186             Picture *pic= h->long_ref[i];
3187             if (pic) {
3188                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3189             }
3190         }
3191     }
3192 }
3193
3194 /**
3195  * Executes the reference picture marking (memory management control operations).
3196  */
3197 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3198     MpegEncContext * const s = &h->s;
3199     int i, j;
3200     int current_ref_assigned=0;
3201     Picture *pic;
3202
3203     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3204         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3205
3206     for(i=0; i<mmco_count; i++){
3207         int structure, frame_num;
3208         if(s->avctx->debug&FF_DEBUG_MMCO)
3209             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3210
3211         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3212            || mmco[i].opcode == MMCO_SHORT2LONG){
3213             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3214             pic = find_short(h, frame_num, &j);
3215             if(!pic){
3216                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3217                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3218                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3219                 continue;
3220             }
3221         }
3222
3223         switch(mmco[i].opcode){
3224         case MMCO_SHORT2UNUSED:
3225             if(s->avctx->debug&FF_DEBUG_MMCO)
3226                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3227             remove_short(h, frame_num, structure ^ PICT_FRAME);
3228             break;
3229         case MMCO_SHORT2LONG:
3230                 if (h->long_ref[mmco[i].long_arg] != pic)
3231                     remove_long(h, mmco[i].long_arg, 0);
3232
3233                 remove_short_at_index(h, j);
3234                 h->long_ref[ mmco[i].long_arg ]= pic;
3235                 if (h->long_ref[ mmco[i].long_arg ]){
3236                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3237                     h->long_ref_count++;
3238                 }
3239             break;
3240         case MMCO_LONG2UNUSED:
3241             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3242             pic = h->long_ref[j];
3243             if (pic) {
3244                 remove_long(h, j, structure ^ PICT_FRAME);
3245             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3246                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3247             break;
3248         case MMCO_LONG:
3249                     // Comment below left from previous code as it is an interresting note.
3250                     /* First field in pair is in short term list or
3251                      * at a different long term index.
3252                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3253                      * Report the problem and keep the pair where it is,
3254                      * and mark this field valid.
3255                      */
3256
3257             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3258                 remove_long(h, mmco[i].long_arg, 0);
3259
3260                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3261                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3262                 h->long_ref_count++;
3263             }
3264
3265             s->current_picture_ptr->reference |= s->picture_structure;
3266             current_ref_assigned=1;
3267             break;
3268         case MMCO_SET_MAX_LONG:
3269             assert(mmco[i].long_arg <= 16);
3270             // just remove the long term which index is greater than new max
3271             for(j = mmco[i].long_arg; j<16; j++){
3272                 remove_long(h, j, 0);
3273             }
3274             break;
3275         case MMCO_RESET:
3276             while(h->short_ref_count){
3277                 remove_short(h, h->short_ref[0]->frame_num, 0);
3278             }
3279             for(j = 0; j < 16; j++) {
3280                 remove_long(h, j, 0);
3281             }
3282             s->current_picture_ptr->poc=
3283             s->current_picture_ptr->field_poc[0]=
3284             s->current_picture_ptr->field_poc[1]=
3285             h->poc_lsb=
3286             h->poc_msb=
3287             h->frame_num=
3288             s->current_picture_ptr->frame_num= 0;
3289             break;
3290         default: assert(0);
3291         }
3292     }
3293
3294     if (!current_ref_assigned) {
3295         /* Second field of complementary field pair; the first field of
3296          * which is already referenced. If short referenced, it
3297          * should be first entry in short_ref. If not, it must exist
3298          * in long_ref; trying to put it on the short list here is an
3299          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3300          */
3301         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3302             /* Just mark the second field valid */
3303             s->current_picture_ptr->reference = PICT_FRAME;
3304         } else if (s->current_picture_ptr->long_ref) {
3305             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3306                                              "assignment for second field "
3307                                              "in complementary field pair "
3308                                              "(first field is long term)\n");
3309         } else {
3310             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3311             if(pic){
3312                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3313             }
3314
3315             if(h->short_ref_count)
3316                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3317
3318             h->short_ref[0]= s->current_picture_ptr;
3319             h->short_ref_count++;
3320             s->current_picture_ptr->reference |= s->picture_structure;
3321         }
3322     }
3323
3324     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3325
3326         /* We have too many reference frames, probably due to corrupted
3327          * stream. Need to discard one frame. Prevents overrun of the
3328          * short_ref and long_ref buffers.
3329          */
3330         av_log(h->s.avctx, AV_LOG_ERROR,
3331                "number of reference frames exceeds max (probably "
3332                "corrupt input), discarding one\n");
3333
3334         if (h->long_ref_count && !h->short_ref_count) {
3335             for (i = 0; i < 16; ++i)
3336                 if (h->long_ref[i])
3337                     break;
3338
3339             assert(i < 16);
3340             remove_long(h, i, 0);
3341         } else {
3342             pic = h->short_ref[h->short_ref_count - 1];
3343             remove_short(h, pic->frame_num, 0);
3344         }
3345     }
3346
3347     print_short_term(h);
3348     print_long_term(h);
3349     return 0;
3350 }
3351
3352 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3353     MpegEncContext * const s = &h->s;
3354     int i;
3355
3356     h->mmco_index= 0;
3357     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3358         s->broken_link= get_bits1(gb) -1;
3359         if(get_bits1(gb)){
3360             h->mmco[0].opcode= MMCO_LONG;
3361             h->mmco[0].long_arg= 0;
3362             h->mmco_index= 1;
3363         }
3364     }else{
3365         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3366             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3367                 MMCOOpcode opcode= get_ue_golomb(gb);
3368
3369                 h->mmco[i].opcode= opcode;
3370                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3371                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3372 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3373                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3374                         return -1;
3375                     }*/
3376                 }
3377                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3378                     unsigned int long_arg= get_ue_golomb(gb);
3379                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3380                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3381                         return -1;
3382                     }
3383                     h->mmco[i].long_arg= long_arg;
3384                 }
3385
3386                 if(opcode > (unsigned)MMCO_LONG){
3387                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3388                     return -1;
3389                 }
3390                 if(opcode == MMCO_END)
3391                     break;
3392             }
3393             h->mmco_index= i;
3394         }else{
3395             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3396
3397             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3398                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3399                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3400                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3401                 h->mmco_index= 1;
3402                 if (FIELD_PICTURE) {
3403                     h->mmco[0].short_pic_num *= 2;
3404                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3405                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3406                     h->mmco_index= 2;
3407                 }
3408             }
3409         }
3410     }
3411
3412     return 0;
3413 }
3414
3415 static int init_poc(H264Context *h){
3416     MpegEncContext * const s = &h->s;
3417     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3418     int field_poc[2];
3419     Picture *cur = s->current_picture_ptr;
3420
3421     h->frame_num_offset= h->prev_frame_num_offset;
3422     if(h->frame_num < h->prev_frame_num)
3423         h->frame_num_offset += max_frame_num;
3424
3425     if(h->sps.poc_type==0){
3426         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3427
3428         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3429             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3430         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3431             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3432         else
3433             h->poc_msb = h->prev_poc_msb;
3434 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3435         field_poc[0] =
3436         field_poc[1] = h->poc_msb + h->poc_lsb;
3437         if(s->picture_structure == PICT_FRAME)
3438             field_poc[1] += h->delta_poc_bottom;
3439     }else if(h->sps.poc_type==1){
3440         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3441         int i;
3442
3443         if(h->sps.poc_cycle_length != 0)
3444             abs_frame_num = h->frame_num_offset + h->frame_num;
3445         else
3446             abs_frame_num = 0;
3447
3448         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3449             abs_frame_num--;
3450
3451         expected_delta_per_poc_cycle = 0;
3452         for(i=0; i < h->sps.poc_cycle_length; i++)
3453             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3454
3455         if(abs_frame_num > 0){
3456             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3457             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3458
3459             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3460             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3461                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3462         } else
3463             expectedpoc = 0;
3464
3465         if(h->nal_ref_idc == 0)
3466             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3467
3468         field_poc[0] = expectedpoc + h->delta_poc[0];
3469         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3470
3471         if(s->picture_structure == PICT_FRAME)
3472             field_poc[1] += h->delta_poc[1];
3473     }else{
3474         int poc= 2*(h->frame_num_offset + h->frame_num);
3475
3476         if(!h->nal_ref_idc)
3477             poc--;
3478
3479         field_poc[0]= poc;
3480         field_poc[1]= poc;
3481     }
3482
3483     if(s->picture_structure != PICT_BOTTOM_FIELD)
3484         s->current_picture_ptr->field_poc[0]= field_poc[0];
3485     if(s->picture_structure != PICT_TOP_FIELD)
3486         s->current_picture_ptr->field_poc[1]= field_poc[1];
3487     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3488
3489     return 0;
3490 }
3491
3492
3493 /**
3494  * initialize scan tables
3495  */
3496 static void init_scan_tables(H264Context *h){
3497     MpegEncContext * const s = &h->s;
3498     int i;
3499     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3500         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3501         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3502     }else{
3503         for(i=0; i<16; i++){
3504 #define T(x) (x>>2) | ((x<<2) & 0xF)
3505             h->zigzag_scan[i] = T(zigzag_scan[i]);
3506             h-> field_scan[i] = T( field_scan[i]);
3507 #undef T
3508         }
3509     }
3510     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3511         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3512         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3513         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3514         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3515     }else{
3516         for(i=0; i<64; i++){
3517 #define T(x) (x>>3) | ((x&7)<<3)
3518             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3519             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3520             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3521             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3522 #undef T
3523         }
3524     }
3525     if(h->sps.transform_bypass){ //FIXME same ugly
3526         h->zigzag_scan_q0          = zigzag_scan;
3527         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3528         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3529         h->field_scan_q0           = field_scan;
3530         h->field_scan8x8_q0        = field_scan8x8;
3531         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3532     }else{
3533         h->zigzag_scan_q0          = h->zigzag_scan;
3534         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3535         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3536         h->field_scan_q0           = h->field_scan;
3537         h->field_scan8x8_q0        = h->field_scan8x8;
3538         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3539     }
3540 }
3541
3542 /**
3543  * Replicates H264 "master" context to thread contexts.
3544  */
3545 static void clone_slice(H264Context *dst, H264Context *src)
3546 {
3547     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3548     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3549     dst->s.current_picture      = src->s.current_picture;
3550     dst->s.linesize             = src->s.linesize;
3551     dst->s.uvlinesize           = src->s.uvlinesize;
3552     dst->s.first_field          = src->s.first_field;
3553
3554     dst->prev_poc_msb           = src->prev_poc_msb;
3555     dst->prev_poc_lsb           = src->prev_poc_lsb;
3556     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3557     dst->prev_frame_num         = src->prev_frame_num;
3558     dst->short_ref_count        = src->short_ref_count;
3559
3560     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3561     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3562     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3563     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3564
3565     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3566     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3567 }
3568
3569 /**
3570  * decodes a slice header.
3571  * This will also call MPV_common_init() and frame_start() as needed.
3572  *
3573  * @param h h264context
3574  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3575  *
3576  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3577  */
3578 static int decode_slice_header(H264Context *h, H264Context *h0){
3579     MpegEncContext * const s = &h->s;
3580     MpegEncContext * const s0 = &h0->s;
3581     unsigned int first_mb_in_slice;
3582     unsigned int pps_id;
3583     int num_ref_idx_active_override_flag;
3584     unsigned int slice_type, tmp, i, j;
3585     int default_ref_list_done = 0;
3586     int last_pic_structure;
3587
3588     s->dropable= h->nal_ref_idc == 0;
3589
3590     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3591         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3592         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3593     }else{
3594         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3595         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3596     }
3597
3598     first_mb_in_slice= get_ue_golomb(&s->gb);
3599
3600     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3601         h0->current_slice = 0;
3602         if (!s0->first_field)
3603             s->current_picture_ptr= NULL;
3604     }
3605
3606     slice_type= get_ue_golomb(&s->gb);
3607     if(slice_type > 9){
3608         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3609         return -1;
3610     }
3611     if(slice_type > 4){
3612         slice_type -= 5;
3613         h->slice_type_fixed=1;
3614     }else
3615         h->slice_type_fixed=0;
3616
3617     slice_type= golomb_to_pict_type[ slice_type ];
3618     if (slice_type == FF_I_TYPE
3619         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3620         default_ref_list_done = 1;
3621     }
3622     h->slice_type= slice_type;
3623     h->slice_type_nos= slice_type & 3;
3624
3625     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3626     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3627         av_log(h->s.avctx, AV_LOG_ERROR,
3628                "B picture before any references, skipping\n");
3629         return -1;
3630     }
3631
3632     pps_id= get_ue_golomb(&s->gb);
3633     if(pps_id>=MAX_PPS_COUNT){
3634         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3635         return -1;
3636     }
3637     if(!h0->pps_buffers[pps_id]) {
3638         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3639         return -1;
3640     }
3641     h->pps= *h0->pps_buffers[pps_id];
3642
3643     if(!h0->sps_buffers[h->pps.sps_id]) {
3644         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3645         return -1;
3646     }
3647     h->sps = *h0->sps_buffers[h->pps.sps_id];
3648
3649     if(h == h0 && h->dequant_coeff_pps != pps_id){
3650         h->dequant_coeff_pps = pps_id;
3651         init_dequant_tables(h);
3652     }
3653
3654     s->mb_width= h->sps.mb_width;
3655     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3656
3657     h->b_stride=  s->mb_width*4;
3658     h->b8_stride= s->mb_width*2;
3659
3660     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3661     if(h->sps.frame_mbs_only_flag)
3662         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3663     else
3664         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3665
3666     if (s->context_initialized
3667         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3668         if(h != h0)
3669             return -1;   // width / height changed during parallelized decoding
3670         free_tables(h);
3671         flush_dpb(s->avctx);
3672         MPV_common_end(s);
3673     }
3674     if (!s->context_initialized) {
3675         if(h != h0)
3676             return -1;  // we cant (re-)initialize context during parallel decoding
3677         if (MPV_common_init(s) < 0)
3678             return -1;
3679         s->first_field = 0;
3680
3681         init_scan_tables(h);
3682         alloc_tables(h);
3683
3684         for(i = 1; i < s->avctx->thread_count; i++) {
3685             H264Context *c;
3686             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3687             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3688             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3689             c->sps = h->sps;
3690             c->pps = h->pps;
3691             init_scan_tables(c);
3692             clone_tables(c, h);
3693         }
3694
3695         for(i = 0; i < s->avctx->thread_count; i++)
3696             if(context_init(h->thread_context[i]) < 0)
3697                 return -1;
3698
3699         s->avctx->width = s->width;
3700         s->avctx->height = s->height;
3701         s->avctx->sample_aspect_ratio= h->sps.sar;
3702         if(!s->avctx->sample_aspect_ratio.den)
3703             s->avctx->sample_aspect_ratio.den = 1;
3704
3705         if(h->sps.timing_info_present_flag){
3706             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3707             if(h->x264_build > 0 && h->x264_build < 44)
3708                 s->avctx->time_base.den *= 2;
3709             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3710                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3711         }
3712     }
3713
3714     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3715
3716     h->mb_mbaff = 0;
3717     h->mb_aff_frame = 0;
3718     last_pic_structure = s0->picture_structure;
3719     if(h->sps.frame_mbs_only_flag){
3720         s->picture_structure= PICT_FRAME;
3721     }else{
3722         if(get_bits1(&s->gb)) { //field_pic_flag
3723             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3724         } else {
3725             s->picture_structure= PICT_FRAME;
3726             h->mb_aff_frame = h->sps.mb_aff;
3727         }
3728     }
3729     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3730
3731     if(h0->current_slice == 0){
3732         while(h->frame_num !=  h->prev_frame_num &&
3733               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3734             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3735             frame_start(h);
3736             h->prev_frame_num++;
3737             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3738             s->current_picture_ptr->frame_num= h->prev_frame_num;
3739             execute_ref_pic_marking(h, NULL, 0);
3740         }
3741
3742         /* See if we have a decoded first field looking for a pair... */
3743         if (s0->first_field) {
3744             assert(s0->current_picture_ptr);
3745             assert(s0->current_picture_ptr->data[0]);
3746             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3747
3748             /* figure out if we have a complementary field pair */
3749             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3750                 /*
3751                  * Previous field is unmatched. Don't display it, but let it
3752                  * remain for reference if marked as such.
3753                  */
3754                 s0->current_picture_ptr = NULL;
3755                 s0->first_field = FIELD_PICTURE;
3756
3757             } else {
3758                 if (h->nal_ref_idc &&
3759                         s0->current_picture_ptr->reference &&
3760                         s0->current_picture_ptr->frame_num != h->frame_num) {
3761                     /*
3762                      * This and previous field were reference, but had
3763                      * different frame_nums. Consider this field first in
3764                      * pair. Throw away previous field except for reference
3765                      * purposes.
3766                      */
3767                     s0->first_field = 1;
3768                     s0->current_picture_ptr = NULL;
3769
3770                 } else {
3771                     /* Second field in complementary pair */
3772                     s0->first_field = 0;
3773                 }
3774             }
3775
3776         } else {
3777             /* Frame or first field in a potentially complementary pair */
3778             assert(!s0->current_picture_ptr);
3779             s0->first_field = FIELD_PICTURE;
3780         }
3781
3782         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3783             s0->first_field = 0;
3784             return -1;
3785         }
3786     }
3787     if(h != h0)
3788         clone_slice(h, h0);
3789
3790     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3791
3792     assert(s->mb_num == s->mb_width * s->mb_height);
3793     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3794        first_mb_in_slice                    >= s->mb_num){
3795         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3796         return -1;
3797     }
3798     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3799     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3800     if (s->picture_structure == PICT_BOTTOM_FIELD)
3801         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3802     assert(s->mb_y < s->mb_height);
3803
3804     if(s->picture_structure==PICT_FRAME){
3805         h->curr_pic_num=   h->frame_num;
3806         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3807     }else{
3808         h->curr_pic_num= 2*h->frame_num + 1;
3809         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3810     }
3811
3812     if(h->nal_unit_type == NAL_IDR_SLICE){
3813         get_ue_golomb(&s->gb); /* idr_pic_id */
3814     }
3815
3816     if(h->sps.poc_type==0){
3817         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3818
3819         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3820             h->delta_poc_bottom= get_se_golomb(&s->gb);
3821         }
3822     }
3823
3824     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3825         h->delta_poc[0]= get_se_golomb(&s->gb);
3826
3827         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3828             h->delta_poc[1]= get_se_golomb(&s->gb);
3829     }
3830
3831     init_poc(h);
3832
3833     if(h->pps.redundant_pic_cnt_present){
3834         h->redundant_pic_count= get_ue_golomb(&s->gb);
3835     }
3836
3837     //set defaults, might be overridden a few lines later
3838     h->ref_count[0]= h->pps.ref_count[0];
3839     h->ref_count[1]= h->pps.ref_count[1];
3840
3841     if(h->slice_type_nos != FF_I_TYPE){
3842         if(h->slice_type_nos == FF_B_TYPE){
3843             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3844         }
3845         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3846
3847         if(num_ref_idx_active_override_flag){
3848             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3849             if(h->slice_type_nos==FF_B_TYPE)
3850                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3851
3852             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3853                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3854                 h->ref_count[0]= h->ref_count[1]= 1;
3855                 return -1;
3856             }
3857         }
3858         if(h->slice_type_nos == FF_B_TYPE)
3859             h->list_count= 2;
3860         else
3861             h->list_count= 1;
3862     }else
3863         h->list_count= 0;
3864
3865     if(!default_ref_list_done){
3866         fill_default_ref_list(h);
3867     }
3868
3869     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3870         return -1;
3871
3872     if(h->slice_type_nos!=FF_I_TYPE){
3873         s->last_picture_ptr= &h->ref_list[0][0];
3874         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3875     }
3876     if(h->slice_type_nos==FF_B_TYPE){
3877         s->next_picture_ptr= &h->ref_list[1][0];
3878         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3879     }
3880
3881     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3882        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3883         pred_weight_table(h);
3884     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3885         implicit_weight_table(h);
3886     else
3887         h->use_weight = 0;
3888
3889     if(h->nal_ref_idc)
3890         decode_ref_pic_marking(h0, &s->gb);
3891
3892     if(FRAME_MBAFF)
3893         fill_mbaff_ref_list(h);
3894
3895     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3896         direct_dist_scale_factor(h);
3897     direct_ref_list_init(h);
3898
3899     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3900         tmp = get_ue_golomb(&s->gb);
3901         if(tmp > 2){
3902             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3903             return -1;
3904         }
3905         h->cabac_init_idc= tmp;
3906     }
3907
3908     h->last_qscale_diff = 0;
3909     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3910     if(tmp>51){
3911         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3912         return -1;
3913     }
3914     s->qscale= tmp;
3915     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3916     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3917     //FIXME qscale / qp ... stuff
3918     if(h->slice_type == FF_SP_TYPE){
3919         get_bits1(&s->gb); /* sp_for_switch_flag */
3920     }
3921     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3922         get_se_golomb(&s->gb); /* slice_qs_delta */
3923     }
3924
3925     h->deblocking_filter = 1;
3926     h->slice_alpha_c0_offset = 0;
3927     h->slice_beta_offset = 0;
3928     if( h->pps.deblocking_filter_parameters_present ) {
3929         tmp= get_ue_golomb(&s->gb);
3930         if(tmp > 2){
3931             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3932             return -1;
3933         }
3934         h->deblocking_filter= tmp;
3935         if(h->deblocking_filter < 2)
3936             h->deblocking_filter^= 1; // 1<->0
3937
3938         if( h->deblocking_filter ) {
3939             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3940             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3941         }
3942     }
3943
3944     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3945        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3946        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3947        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3948         h->deblocking_filter= 0;
3949
3950     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3951         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3952             /* Cheat slightly for speed:
3953                Do not bother to deblock across slices. */
3954             h->deblocking_filter = 2;
3955         } else {
3956             h0->max_contexts = 1;
3957             if(!h0->single_decode_warning) {
3958                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3959                 h0->single_decode_warning = 1;
3960             }
3961             if(h != h0)
3962                 return 1; // deblocking switched inside frame
3963         }
3964     }
3965
3966 #if 0 //FMO
3967     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3968         slice_group_change_cycle= get_bits(&s->gb, ?);
3969 #endif
3970
3971     h0->last_slice_type = slice_type;
3972     h->slice_num = ++h0->current_slice;
3973     if(h->slice_num >= MAX_SLICES){
3974         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3975     }
3976
3977     for(j=0; j<2; j++){
3978         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3979         ref2frm[0]=
3980         ref2frm[1]= -1;
3981         for(i=0; i<16; i++)
3982             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3983                           +(h->ref_list[j][i].reference&3);
3984         ref2frm[18+0]=
3985         ref2frm[18+1]= -1;
3986         for(i=16; i<48; i++)
3987             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3988                           +(h->ref_list[j][i].reference&3);
3989     }
3990
3991     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3992     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3993
3994     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3995         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
3996                h->slice_num,
3997                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3998                first_mb_in_slice,
3999                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4000                pps_id, h->frame_num,
4001                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4002                h->ref_count[0], h->ref_count[1],
4003                s->qscale,
4004                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4005                h->use_weight,
4006                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4007                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4008                );
4009     }
4010
4011     return 0;
4012 }
4013
4014 /**
4015  *
4016  */
4017 static inline int get_level_prefix(GetBitContext *gb){
4018     unsigned int buf;
4019     int log;
4020
4021     OPEN_READER(re, gb);
4022     UPDATE_CACHE(re, gb);
4023     buf=GET_CACHE(re, gb);
4024
4025     log= 32 - av_log2(buf);
4026 #ifdef TRACE
4027     print_bin(buf>>(32-log), log);
4028     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4029 #endif
4030
4031     LAST_SKIP_BITS(re, gb, log);
4032     CLOSE_READER(re, gb);
4033
4034     return log-1;
4035 }
4036
4037 static inline int get_dct8x8_allowed(H264Context *h){
4038     int i;
4039     for(i=0; i<4; i++){
4040         if(!IS_SUB_8X8(h->sub_mb_type[i])
4041            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4042             return 0;
4043     }
4044     return 1;
4045 }
4046
4047 /**
4048  * decodes a residual block.
4049  * @param n block index
4050  * @param scantable scantable
4051  * @param max_coeff number of coefficients in the block
4052  * @return <0 if an error occurred
4053  */
4054 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4055     MpegEncContext * const s = &h->s;
4056     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4057     int level[16];
4058     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4059
4060     //FIXME put trailing_onex into the context
4061
4062     if(n == CHROMA_DC_BLOCK_INDEX){
4063         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4064         total_coeff= coeff_token>>2;
4065     }else{
4066         if(n == LUMA_DC_BLOCK_INDEX){
4067             total_coeff= pred_non_zero_count(h, 0);
4068             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4069             total_coeff= coeff_token>>2;
4070         }else{
4071             total_coeff= pred_non_zero_count(h, n);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4075         }
4076     }
4077
4078     //FIXME set last_non_zero?
4079
4080     if(total_coeff==0)
4081         return 0;
4082     if(total_coeff > (unsigned)max_coeff) {
4083         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4084         return -1;
4085     }
4086
4087     trailing_ones= coeff_token&3;
4088     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4089     assert(total_coeff<=16);
4090
4091     i = show_bits(gb, 3);
4092     skip_bits(gb, trailing_ones);
4093     level[0] = 1-((i&4)>>1);
4094     level[1] = 1-((i&2)   );
4095     level[2] = 1-((i&1)<<1);
4096
4097     if(trailing_ones<total_coeff) {
4098         int level_code, mask;
4099         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4100         int prefix= get_level_prefix(gb);
4101
4102         //first coefficient has suffix_length equal to 0 or 1
4103         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4104             if(suffix_length)
4105                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4106             else
4107                 level_code= (prefix<<suffix_length); //part
4108         }else if(prefix==14){
4109             if(suffix_length)
4110                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4111             else
4112                 level_code= prefix + get_bits(gb, 4); //part
4113         }else{
4114             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4115             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4116             if(prefix>=16)
4117                 level_code += (1<<(prefix-3))-4096;
4118         }
4119
4120         if(trailing_ones < 3) level_code += 2;
4121
4122         suffix_length = 1;
4123         if(level_code > 5)
4124             suffix_length++;
4125         mask= -(level_code&1);
4126         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4127
4128         //remaining coefficients have suffix_length > 0
4129         for(i=trailing_ones+1;i<total_coeff;i++) {
4130             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4131             prefix = get_level_prefix(gb);
4132             if(prefix<15){
4133                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4134             }else{
4135                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4136                 if(prefix>=16)
4137                     level_code += (1<<(prefix-3))-4096;
4138             }
4139             mask= -(level_code&1);
4140             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4141             if(level_code > suffix_limit[suffix_length])
4142                 suffix_length++;
4143         }
4144     }
4145
4146     if(total_coeff == max_coeff)
4147         zeros_left=0;
4148     else{
4149         if(n == CHROMA_DC_BLOCK_INDEX)
4150             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4151         else
4152             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4153     }
4154
4155     coeff_num = zeros_left + total_coeff - 1;
4156     j = scantable[coeff_num];
4157     if(n > 24){
4158         block[j] = level[0];
4159         for(i=1;i<total_coeff;i++) {
4160             if(zeros_left <= 0)
4161                 run_before = 0;
4162             else if(zeros_left < 7){
4163                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4164             }else{
4165                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4166             }
4167             zeros_left -= run_before;
4168             coeff_num -= 1 + run_before;
4169             j= scantable[ coeff_num ];
4170
4171             block[j]= level[i];
4172         }
4173     }else{
4174         block[j] = (level[0] * qmul[j] + 32)>>6;
4175         for(i=1;i<total_coeff;i++) {
4176             if(zeros_left <= 0)
4177                 run_before = 0;
4178             else if(zeros_left < 7){
4179                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4180             }else{
4181                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4182             }
4183             zeros_left -= run_before;
4184             coeff_num -= 1 + run_before;
4185             j= scantable[ coeff_num ];
4186
4187             block[j]= (level[i] * qmul[j] + 32)>>6;
4188         }
4189     }
4190
4191     if(zeros_left<0){
4192         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4193         return -1;
4194     }
4195
4196     return 0;
4197 }
4198
4199 static void predict_field_decoding_flag(H264Context *h){
4200     MpegEncContext * const s = &h->s;
4201     const int mb_xy= h->mb_xy;
4202     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4203                 ? s->current_picture.mb_type[mb_xy-1]
4204                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4205                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4206                 : 0;
4207     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4208 }
4209
4210 /**
4211  * decodes a P_SKIP or B_SKIP macroblock
4212  */
4213 static void decode_mb_skip(H264Context *h){
4214     MpegEncContext * const s = &h->s;
4215     const int mb_xy= h->mb_xy;
4216     int mb_type=0;
4217
4218     memset(h->non_zero_count[mb_xy], 0, 16);
4219     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4220
4221     if(MB_FIELD)
4222         mb_type|= MB_TYPE_INTERLACED;
4223
4224     if( h->slice_type_nos == FF_B_TYPE )
4225     {
4226         // just for fill_caches. pred_direct_motion will set the real mb_type
4227         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4228
4229         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4230         pred_direct_motion(h, &mb_type);
4231         mb_type|= MB_TYPE_SKIP;
4232     }
4233     else
4234     {
4235         int mx, my;
4236         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4237
4238         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4239         pred_pskip_motion(h, &mx, &my);
4240         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4241         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4242     }
4243
4244     write_back_motion(h, mb_type);
4245     s->current_picture.mb_type[mb_xy]= mb_type;
4246     s->current_picture.qscale_table[mb_xy]= s->qscale;
4247     h->slice_table[ mb_xy ]= h->slice_num;
4248     h->prev_mb_skipped= 1;
4249 }
4250
4251 /**
4252  * Decodes one macroblock of the current slice, reading its syntax elements
      * with the Exp-Golomb / plain bit readers (get_ue_golomb, get_se_golomb,
      * get_te0_golomb, get_bits) from s->gb.
      *
      * The function handles, in order: the skip run of non-I slices, the MBAFF
      * field decoding flag, mb_type, raw PCM samples, intra prediction modes or
      * inter reference indices and motion vectors, the coded block pattern, the
      * quantizer delta and finally the residual coefficients.
      *
4253  * @returns 0 if OK (including skipped macroblocks), -1 if an error is noticed
4254  */
4255 static int decode_mb_cavlc(H264Context *h){
4256     MpegEncContext * const s = &h->s;
4257     int mb_xy;
4258     int partition_count;
4259     unsigned int mb_type, cbp;
4260     int dct8x8_allowed= h->pps.transform_8x8_mode;
4261
4262     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4263
4264     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4265
4266     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4267     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4268                 down the code */
         /* Non-I slices: a run length of skipped macroblocks is coded before each
          * non-skipped one. mb_skip_run == -1 means a new run length has to be read. */
4269     if(h->slice_type_nos != FF_I_TYPE){
4270         if(s->mb_skip_run==-1)
4271             s->mb_skip_run= get_ue_golomb(&s->gb);
4272
4273         if (s->mb_skip_run--) {
                 /* On even rows in MBAFF frames the field flag is read only when this
                  * is the last macroblock of the run, otherwise it is predicted. */
4274             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4275                 if(s->mb_skip_run==0)
4276                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4277                 else
4278                     predict_field_decoding_flag(h);
4279             }
4280             decode_mb_skip(h);
4281             return 0;
4282         }
4283     }
4284     if(FRAME_MBAFF){
4285         if( (s->mb_y&1) == 0 )
4286             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4287     }
4288
4289     h->prev_mb_skipped= 0;
4290
         /* mb_type: values below 23 (B slices) / 5 (P slices) index the inter type
          * tables; larger values are intra types offset by that amount. */
4291     mb_type= get_ue_golomb(&s->gb);
4292     if(h->slice_type_nos == FF_B_TYPE){
4293         if(mb_type < 23){
4294             partition_count= b_mb_type_info[mb_type].partition_count;
4295             mb_type=         b_mb_type_info[mb_type].type;
4296         }else{
4297             mb_type -= 23;
4298             goto decode_intra_mb;
4299         }
4300     }else if(h->slice_type_nos == FF_P_TYPE){
4301         if(mb_type < 5){
4302             partition_count= p_mb_type_info[mb_type].partition_count;
4303             mb_type=         p_mb_type_info[mb_type].type;
4304         }else{
4305             mb_type -= 5;
4306             goto decode_intra_mb;
4307         }
4308     }else{
4309        assert(h->slice_type_nos == FF_I_TYPE);
4310         if(h->slice_type == FF_SI_TYPE && mb_type)
4311             mb_type--;
4312 decode_intra_mb:
             /* mb_type is unsigned, so this single test bounds the i_mb_type_info lookups below */
4313         if(mb_type > 25){
4314             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4315             return -1;
4316         }
4317         partition_count=0;
4318         cbp= i_mb_type_info[mb_type].cbp;
4319         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4320         mb_type= i_mb_type_info[mb_type].type;
4321     }
4322
4323     if(MB_FIELD)
4324         mb_type |= MB_TYPE_INTERLACED;
4325
4326     h->slice_table[ mb_xy ]= h->slice_num;
4327
4328     if(IS_INTRA_PCM(mb_type)){
4329         unsigned int x;
4330
4331         // We assume these blocks are very rare so we do not optimize it.
4332         align_get_bits(&s->gb);
4333
4334         // The pixels are stored in the same order as levels in h->mb array.
4335         for(x=0; x < (CHROMA ? 384 : 256); x++){
4336             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4337         }
4338
4339         // In deblocking, the quantizer is 0
4340         s->current_picture.qscale_table[mb_xy]= 0;
4341         // All coeffs are present
4342         memset(h->non_zero_count[mb_xy], 16, 16);
4343
4344         s->current_picture.mb_type[mb_xy]= mb_type;
4345         return 0;
4346     }
4347
         /* The reference counts are doubled while an MBAFF macroblock is decoded and
          * halved again just before the final "return 0".
          * NOTE(review): the "return -1" error paths in between do not undo this
          * doubling - confirm that callers reinitialize ref_count after an error. */
4348     if(MB_MBAFF){
4349         h->ref_count[0] <<= 1;
4350         h->ref_count[1] <<= 1;
4351     }
4352
4353     fill_caches(h, mb_type, 0);
4354
4355     //mb_pred
4356     if(IS_INTRA(mb_type)){
4357         int pred_mode;
4358 //            init_top_left_availability(h);
4359         if(IS_INTRA4x4(mb_type)){
4360             int i;
4361             int di = 1;
                 /* with the 8x8 transform one prediction mode is coded per 8x8 block (step 4) */
4362             if(dct8x8_allowed && get_bits1(&s->gb)){
4363                 mb_type |= MB_TYPE_8x8DCT;
4364                 di = 4;
4365             }
4366
4367 //                fill_intra4x4_pred_table(h);
4368             for(i=0; i<16; i+=di){
4369                 int mode= pred_intra_mode(h, i);
4370
                     /* a 0 bit means the predicted mode is not used: a 3 bit remainder
                      * follows which skips over the predicted mode */
4371                 if(!get_bits1(&s->gb)){
4372                     const int rem_mode= get_bits(&s->gb, 3);
4373                     mode = rem_mode + (rem_mode >= mode);
4374                 }
4375
4376                 if(di==4)
4377                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4378                 else
4379                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4380             }
4381             write_back_intra_pred_mode(h);
4382             if( check_intra4x4_pred_mode(h) < 0)
4383                 return -1;
4384         }else{
4385             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4386             if(h->intra16x16_pred_mode < 0)
4387                 return -1;
4388         }
4389         if(CHROMA){
4390             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4391             if(pred_mode < 0)
4392                 return -1;
4393             h->chroma_pred_mode= pred_mode;
4394         }
4395     }else if(partition_count==4){
             /* four 8x8 partitions: sub types first, then all reference indices,
              * then all motion vectors */
4396         int i, j, sub_partition_count[4], list, ref[2][4];
4397
4398         if(h->slice_type_nos == FF_B_TYPE){
4399             for(i=0; i<4; i++){
4400                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4401                 if(h->sub_mb_type[i] >=13){
4402                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4403                     return -1;
4404                 }
4405                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4406                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4407             }
4408             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4409                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4410                 pred_direct_motion(h, &mb_type);
4411                 h->ref_cache[0][scan8[4]] =
4412                 h->ref_cache[1][scan8[4]] =
4413                 h->ref_cache[0][scan8[12]] =
4414                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4415             }
4416         }else{
4417             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4418             for(i=0; i<4; i++){
4419                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4420                 if(h->sub_mb_type[i] >=4){
4421                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4422                     return -1;
4423                 }
4424                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4425                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4426             }
4427         }
4428
             /* reference indices of the non-direct 8x8 partitions; -1 marks an unused list */
4429         for(list=0; list<h->list_count; list++){
4430             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4431             for(i=0; i<4; i++){
4432                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4433                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4434                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4435                     if(tmp>=ref_count){
4436                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4437                         return -1;
4438                     }
4439                     ref[list][i]= tmp;
4440                 }else{
4441                  //FIXME
4442                     ref[list][i] = -1;
4443                 }
4444             }
4445         }
4446
4447         if(dct8x8_allowed)
4448             dct8x8_allowed = get_dct8x8_allowed(h);
4449
             /* motion vectors: predicted vector plus the coded difference for every sub partition */
4450         for(list=0; list<h->list_count; list++){
4451             for(i=0; i<4; i++){
4452                 if(IS_DIRECT(h->sub_mb_type[i])) {
4453                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4454                     continue;
4455                 }
4456                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4457                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4458
4459                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4460                     const int sub_mb_type= h->sub_mb_type[i];
4461                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4462                     for(j=0; j<sub_partition_count[i]; j++){
4463                         int mx, my;
4464                         const int index= 4*i + block_width*j;
4465                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4466                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4467                         mx += get_se_golomb(&s->gb);
4468                         my += get_se_golomb(&s->gb);
4469                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4470
                             /* replicate the vector over the cache entries covered by this sub partition */
4471                         if(IS_SUB_8X8(sub_mb_type)){
4472                             mv_cache[ 1 ][0]=
4473                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4474                             mv_cache[ 1 ][1]=
4475                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4476                         }else if(IS_SUB_8X4(sub_mb_type)){
4477                             mv_cache[ 1 ][0]= mx;
4478                             mv_cache[ 1 ][1]= my;
4479                         }else if(IS_SUB_4X8(sub_mb_type)){
4480                             mv_cache[ 8 ][0]= mx;
4481                             mv_cache[ 8 ][1]= my;
4482                         }
4483                         mv_cache[ 0 ][0]= mx;
4484                         mv_cache[ 0 ][1]= my;
4485                     }
4486                 }else{
                         /* list not used by this partition: zero its four motion vectors */
4487                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4488                     p[0] = p[1]=
4489                     p[8] = p[9]= 0;
4490                 }
4491             }
4492         }
4493     }else if(IS_DIRECT(mb_type)){
4494         pred_direct_motion(h, &mb_type);
4495         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4496     }else{
             /* 16x16, 16x8 and 8x16 partitions: for each shape all reference indices
              * are read first, then all motion vector differences */
4497         int list, mx, my, i;
4498          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4499         if(IS_16X16(mb_type)){
4500             for(list=0; list<h->list_count; list++){
4501                     unsigned int val;
4502                     if(IS_DIR(mb_type, 0, list)){
4503                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4504                         if(val >= h->ref_count[list]){
4505                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4506                             return -1;
4507                         }
4508                     }else
4509                         val= LIST_NOT_USED&0xFF;
4510                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4511             }
4512             for(list=0; list<h->list_count; list++){
4513                 unsigned int val;
4514                 if(IS_DIR(mb_type, 0, list)){
4515                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4516                     mx += get_se_golomb(&s->gb);
4517                     my += get_se_golomb(&s->gb);
4518                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4519
4520                     val= pack16to32(mx,my);
4521                 }else
4522                     val=0;
4523                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4524             }
4525         }
4526         else if(IS_16X8(mb_type)){
4527             for(list=0; list<h->list_count; list++){
4528                     for(i=0; i<2; i++){
4529                         unsigned int val;
4530                         if(IS_DIR(mb_type, i, list)){
4531                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4532                             if(val >= h->ref_count[list]){
4533                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4534                                 return -1;
4535                             }
4536                         }else
4537                             val= LIST_NOT_USED&0xFF;
4538                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4539                     }
4540             }
4541             for(list=0; list<h->list_count; list++){
4542                 for(i=0; i<2; i++){
4543                     unsigned int val;
4544                     if(IS_DIR(mb_type, i, list)){
4545                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4546                         mx += get_se_golomb(&s->gb);
4547                         my += get_se_golomb(&s->gb);
4548                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4549
4550                         val= pack16to32(mx,my);
4551                     }else
4552                         val=0;
4553                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4554                 }
4555             }
4556         }else{
4557             assert(IS_8X16(mb_type));
4558             for(list=0; list<h->list_count; list++){
4559                     for(i=0; i<2; i++){
4560                         unsigned int val;
4561                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4562                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4563                             if(val >= h->ref_count[list]){
4564                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4565                                 return -1;
4566                             }
4567                         }else
4568                             val= LIST_NOT_USED&0xFF;
4569                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4570                     }
4571             }
4572             for(list=0; list<h->list_count; list++){
4573                 for(i=0; i<2; i++){
4574                     unsigned int val;
4575                     if(IS_DIR(mb_type, i, list)){
4576                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4577                         mx += get_se_golomb(&s->gb);
4578                         my += get_se_golomb(&s->gb);
4579                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4580
4581                         val= pack16to32(mx,my);
4582                     }else
4583                         val=0;
4584                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4585                 }
4586             }
4587         }
4588     }
4589
4590     if(IS_INTER(mb_type))
4591         write_back_motion(h, mb_type);
4592
         /* For intra 16x16 the cbp was already taken from i_mb_type_info; all other
          * types code it explicitly as an index (0..47) into a mapping table. */
4593     if(!IS_INTRA16x16(mb_type)){
4594         cbp= get_ue_golomb(&s->gb);
4595         if(cbp > 47){
4596             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4597             return -1;
4598         }
4599
4600         if(CHROMA){
4601             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4602             else                     cbp= golomb_to_inter_cbp   [cbp];
4603         }else{
4604             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4605             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4606         }
4607     }
4608     h->cbp = cbp;
4609
         /* inter macroblocks with coded luma carry a one bit flag selecting the 8x8 transform */
4610     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4611         if(get_bits1(&s->gb)){
4612             mb_type |= MB_TYPE_8x8DCT;
4613             h->cbp_table[mb_xy]= cbp;
4614         }
4615     }
4616     s->current_picture.mb_type[mb_xy]= mb_type;
4617
4618     if(cbp || IS_INTRA16x16(mb_type)){
4619         int i8x8, i4x4, chroma_idx;
4620         int dquant;
4621         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4622         const uint8_t *scan, *scan8x8, *dc_scan;
4623
4624 //        fill_non_zero_count_cache(h);
4625
             /* scan tables depend on field/frame coding; the *_q0 variants are used when qscale is 0 */
4626         if(IS_INTERLACED(mb_type)){
4627             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4628             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4629             dc_scan= luma_dc_field_scan;
4630         }else{
4631             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4632             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4633             dc_scan= luma_dc_zigzag_scan;
4634         }
4635
4636         dquant= get_se_golomb(&s->gb);
4637
4638         if( dquant > 25 || dquant < -26 ){
4639             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4640             return -1;
4641         }
4642
             /* apply the delta and wrap the result back into 0..51 */
4643         s->qscale += dquant;
4644         if(((unsigned)s->qscale) > 51){
4645             if(s->qscale<0) s->qscale+= 52;
4646             else            s->qscale-= 52;
4647         }
4648
4649         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4650         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4651         if(IS_INTRA16x16(mb_type)){
                 /* luma DC block first, then (if coded) 15 AC coefficients per 4x4 block */
4652             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4653                 return -1; //FIXME continue if partitioned and other return -1 too
4654             }
4655
4656             assert((cbp&15) == 0 || (cbp&15) == 15);
4657
4658             if(cbp&15){
4659                 for(i8x8=0; i8x8<4; i8x8++){
4660                     for(i4x4=0; i4x4<4; i4x4++){
4661                         const int index= i4x4 + 4*i8x8;
4662                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4663                             return -1;
4664                         }
4665                     }
4666                 }
4667             }else{
4668                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4669             }
4670         }else{
                 /* bit i8x8 of cbp tells whether the corresponding 8x8 luma block has coefficients */
4671             for(i8x8=0; i8x8<4; i8x8++){
4672                 if(cbp & (1<<i8x8)){
4673                     if(IS_8x8DCT(mb_type)){
4674                         DCTELEM *buf = &h->mb[64*i8x8];
4675                         uint8_t *nnz;
4676                         for(i4x4=0; i4x4<4; i4x4++){
4677                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4678                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4679                                 return -1;
4680                         }
                             /* accumulate the four per-4x4 counts into the first cache entry of the 8x8 block */
4681                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4682                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4683                     }else{
4684                         for(i4x4=0; i4x4<4; i4x4++){
4685                             const int index= i4x4 + 4*i8x8;
4686
4687                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4688                                 return -1;
4689                             }
4690                         }
4691                     }
4692                 }else{
4693                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4694                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4695                 }
4696             }
4697         }
4698
             /* cbp bits 0x30: chroma DC blocks are coded; bit 0x20: chroma AC blocks are coded too */
4699         if(cbp&0x30){
4700             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4701                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4702                     return -1;
4703                 }
4704         }
4705
4706         if(cbp&0x20){
4707             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4708                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4709                 for(i4x4=0; i4x4<4; i4x4++){
4710                     const int index= 16 + 4*chroma_idx + i4x4;
4711                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4712                         return -1;
4713                     }
4714                 }
4715             }
4716         }else{
4717             uint8_t * const nnz= &h->non_zero_count_cache[0];
4718             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4719             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4720         }
4721     }else{
             /* nothing coded: clear all luma and chroma non-zero counts */
4722         uint8_t * const nnz= &h->non_zero_count_cache[0];
4723         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4724         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4725         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4726     }
4727     s->current_picture.qscale_table[mb_xy]= s->qscale;
4728     write_back_non_zero_count(h);
4729
         /* undo the MBAFF doubling of the reference counts done above */
4730     if(MB_MBAFF){
4731         h->ref_count[0] >>= 1;
4732         h->ref_count[1] >>= 1;
4733     }
4734
4735     return 0;
4736 }
4737
4738 static int decode_cabac_field_decoding_flag(H264Context *h) {
4739     MpegEncContext * const s = &h->s;
4740     const int mb_x = s->mb_x;
4741     const int mb_y = s->mb_y & ~1;
4742     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4743     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4744
4745     unsigned int ctx = 0;
4746
4747     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4748         ctx += 1;
4749     }
4750     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4751         ctx += 1;
4752     }
4753
4754     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4755 }
4756
4757 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4758     uint8_t *state= &h->cabac_state[ctx_base];
4759     int mb_type;
4760
4761     if(intra_slice){
4762         MpegEncContext * const s = &h->s;
4763         const int mba_xy = h->left_mb_xy[0];
4764         const int mbb_xy = h->top_mb_xy;
4765         int ctx=0;
4766         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4767             ctx++;
4768         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4769             ctx++;
4770         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4771             return 0;   /* I4x4 */
4772         state += 2;
4773     }else{
4774         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4775             return 0;   /* I4x4 */
4776     }
4777
4778     if( get_cabac_terminate( &h->cabac ) )
4779         return 25;  /* PCM */
4780
4781     mb_type = 1; /* I16x16 */
4782     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4783     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4784         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4785     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4786     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4787     return mb_type;
4788 }
4789
4790 static int decode_cabac_mb_type( H264Context *h ) {
4791     MpegEncContext * const s = &h->s;
4792
4793     if( h->slice_type_nos == FF_I_TYPE ) {
4794         return decode_cabac_intra_mb_type(h, 3, 1);
4795     } else if( h->slice_type_nos == FF_P_TYPE ) {
4796         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4797             /* P-type */
4798             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4799                 /* P_L0_D16x16, P_8x8 */
4800                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4801             } else {
4802                 /* P_L0_D8x16, P_L0_D16x8 */
4803                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4804             }
4805         } else {
4806             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4807         }
4808     } else {
4809         const int mba_xy = h->left_mb_xy[0];
4810         const int mbb_xy = h->top_mb_xy;
4811         int ctx = 0;
4812         int bits;
4813         assert(h->slice_type_nos == FF_B_TYPE);
4814
4815         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4816             ctx++;
4817         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4818             ctx++;
4819
4820         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4821             return 0; /* B_Direct_16x16 */
4822
4823         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4824             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4825         }
4826
4827         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4828         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4829         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4830         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4831         if( bits < 8 )
4832             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4833         else if( bits == 13 ) {
4834             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4835         } else if( bits == 14 )
4836             return 11; /* B_L1_L0_8x16 */
4837         else if( bits == 15 )
4838             return 22; /* B_8x8 */
4839
4840         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4841         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4842     }
4843 }
4844
4845 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4846     MpegEncContext * const s = &h->s;
4847     int mba_xy, mbb_xy;
4848     int ctx = 0;
4849
4850     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4851         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4852         mba_xy = mb_xy - 1;
4853         if( (mb_y&1)
4854             && h->slice_table[mba_xy] == h->slice_num
4855             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4856             mba_xy += s->mb_stride;
4857         if( MB_FIELD ){
4858             mbb_xy = mb_xy - s->mb_stride;
4859             if( !(mb_y&1)
4860                 && h->slice_table[mbb_xy] == h->slice_num
4861                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4862                 mbb_xy -= s->mb_stride;
4863         }else
4864             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4865     }else{
4866         int mb_xy = h->mb_xy;
4867         mba_xy = mb_xy - 1;
4868         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4869     }
4870
4871     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4872         ctx++;
4873     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4874         ctx++;
4875
4876     if( h->slice_type_nos == FF_B_TYPE )
4877         ctx += 13;
4878     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4879 }
4880
4881 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4882     int mode = 0;
4883
4884     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4885         return pred_mode;
4886
4887     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4888     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4889     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4890
4891     if( mode >= pred_mode )
4892         return mode + 1;
4893     else
4894         return mode;
4895 }
4896
4897 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4898     const int mba_xy = h->left_mb_xy[0];
4899     const int mbb_xy = h->top_mb_xy;
4900
4901     int ctx = 0;
4902
4903     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4904     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4905         ctx++;
4906
4907     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4908         ctx++;
4909
4910     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4911         return 0;
4912
4913     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4914         return 1;
4915     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4916         return 2;
4917     else
4918         return 3;
4919 }
4920
4921 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4922     int cbp_b, cbp_a, ctx, cbp = 0;
4923
4924     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4925     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4926
4927     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4928     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4929     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4930     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4931     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4932     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4933     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4934     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4935     return cbp;
4936 }
4937 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4938     int ctx;
4939     int cbp_a, cbp_b;
4940
4941     cbp_a = (h->left_cbp>>4)&0x03;
4942     cbp_b = (h-> top_cbp>>4)&0x03;
4943
4944     ctx = 0;
4945     if( cbp_a > 0 ) ctx++;
4946     if( cbp_b > 0 ) ctx += 2;
4947     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4948         return 0;
4949
4950     ctx = 4;
4951     if( cbp_a == 2 ) ctx++;
4952     if( cbp_b == 2 ) ctx += 2;
4953     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4954 }
4955 static int decode_cabac_mb_dqp( H264Context *h) {
4956     int   ctx = 0;
4957     int   val = 0;
4958
4959     if( h->last_qscale_diff != 0 )
4960         ctx++;
4961
4962     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4963         if( ctx < 2 )
4964             ctx = 2;
4965         else
4966             ctx = 3;
4967         val++;
4968         if(val > 102) //prevent infinite loop
4969             return INT_MIN;
4970     }
4971
4972     if( val&0x01 )
4973         return (val + 1)/2;
4974     else
4975         return -(val + 1)/2;
4976 }
4977 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4978     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4979         return 0;   /* 8x8 */
4980     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4981         return 1;   /* 8x4 */
4982     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4983         return 2;   /* 4x8 */
4984     return 3;       /* 4x4 */
4985 }
4986 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4987     int type;
4988     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4989         return 0;   /* B_Direct_8x8 */
4990     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4991         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4992     type = 3;
4993     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4994         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4995             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4996         type += 4;
4997     }
4998     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4999     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5000     return type;
5001 }
5002
5003 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5004     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5005 }
5006
5007 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5008     int refa = h->ref_cache[list][scan8[n] - 1];
5009     int refb = h->ref_cache[list][scan8[n] - 8];
5010     int ref  = 0;
5011     int ctx  = 0;
5012
5013     if( h->slice_type_nos == FF_B_TYPE) {
5014         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5015             ctx++;
5016         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5017             ctx += 2;
5018     } else {
5019         if( refa > 0 )
5020             ctx++;
5021         if( refb > 0 )
5022             ctx += 2;
5023     }
5024
5025     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5026         ref++;
5027         if( ctx < 4 )
5028             ctx = 4;
5029         else
5030             ctx = 5;
5031         if(ref >= 32 /*h->ref_list[list]*/){
5032             return -1;
5033         }
5034     }
5035     return ref;
5036 }
5037
5038 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5039     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5040                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5041     int ctxbase = (l == 0) ? 40 : 47;
5042     int mvd;
5043     int ctx = (amvd>2) + (amvd>32);
5044
5045     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5046         return 0;
5047
5048     mvd= 1;
5049     ctx= 3;
5050     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5051         mvd++;
5052         if( ctx < 6 )
5053             ctx++;
5054     }
5055
5056     if( mvd >= 9 ) {
5057         int k = 3;
5058         while( get_cabac_bypass( &h->cabac ) ) {
5059             mvd += 1 << k;
5060             k++;
5061             if(k>24){
5062                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5063                 return INT_MIN;
5064             }
5065         }
5066         while( k-- ) {
5067             if( get_cabac_bypass( &h->cabac ) )
5068                 mvd += 1 << k;
5069         }
5070     }
5071     return get_cabac_bypass_sign( &h->cabac, -mvd );
5072 }
5073
5074 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5075     int nza, nzb;
5076     int ctx = 0;
5077
5078     if( is_dc ) {
5079         if( cat == 0 ) {
5080             nza = h->left_cbp&0x100;
5081             nzb = h-> top_cbp&0x100;
5082         } else {
5083             nza = (h->left_cbp>>(6+idx))&0x01;
5084             nzb = (h-> top_cbp>>(6+idx))&0x01;
5085         }
5086     } else {
5087         assert(cat == 1 || cat == 2 || cat == 4);
5088         nza = h->non_zero_count_cache[scan8[idx] - 1];
5089         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5090     }
5091
5092     if( nza > 0 )
5093         ctx++;
5094
5095     if( nzb > 0 )
5096         ctx += 2;
5097
5098     return ctx + 4 * cat;
5099 }
5100
5101 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5102     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5103     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5104     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5105     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5106 };
5107
5108 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5109     static const int significant_coeff_flag_offset[2][6] = {
5110       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5111       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5112     };
5113     static const int last_coeff_flag_offset[2][6] = {
5114       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5115       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5116     };
5117     static const int coeff_abs_level_m1_offset[6] = {
5118         227+0, 227+10, 227+20, 227+30, 227+39, 426
5119     };
5120     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5121       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5122         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5123         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5124        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5125       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5126         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5127         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5128         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5129     };
5130     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5131      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5132      * map node ctx => cabac ctx for level=1 */
5133     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5134     /* map node ctx => cabac ctx for level>1 */
5135     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5136     static const uint8_t coeff_abs_level_transition[2][8] = {
5137     /* update node ctx after decoding a level=1 */
5138         { 1, 2, 3, 3, 4, 5, 6, 7 },
5139     /* update node ctx after decoding a level>1 */
5140         { 4, 4, 4, 4, 5, 6, 7, 7 }
5141     };
5142
5143     int index[64];
5144
5145     int av_unused last;
5146     int coeff_count = 0;
5147     int node_ctx = 0;
5148
5149     uint8_t *significant_coeff_ctx_base;
5150     uint8_t *last_coeff_ctx_base;
5151     uint8_t *abs_level_m1_ctx_base;
5152
5153 #ifndef ARCH_X86
5154 #define CABAC_ON_STACK
5155 #endif
5156 #ifdef CABAC_ON_STACK
5157 #define CC &cc
5158     CABACContext cc;
5159     cc.range     = h->cabac.range;
5160     cc.low       = h->cabac.low;
5161     cc.bytestream= h->cabac.bytestream;
5162 #else
5163 #define CC &h->cabac
5164 #endif
5165
5166
5167     /* cat: 0-> DC 16x16  n = 0
5168      *      1-> AC 16x16  n = luma4x4idx
5169      *      2-> Luma4x4   n = luma4x4idx
5170      *      3-> DC Chroma n = iCbCr
5171      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5172      *      5-> Luma8x8   n = 4 * luma8x8idx
5173      */
5174
5175     /* read coded block flag */
5176     if( is_dc || cat != 5 ) {
5177         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5178             if( !is_dc )
5179                 h->non_zero_count_cache[scan8[n]] = 0;
5180
5181 #ifdef CABAC_ON_STACK
5182             h->cabac.range     = cc.range     ;
5183             h->cabac.low       = cc.low       ;
5184             h->cabac.bytestream= cc.bytestream;
5185 #endif
5186             return;
5187         }
5188     }
5189
5190     significant_coeff_ctx_base = h->cabac_state
5191         + significant_coeff_flag_offset[MB_FIELD][cat];
5192     last_coeff_ctx_base = h->cabac_state
5193         + last_coeff_flag_offset[MB_FIELD][cat];
5194     abs_level_m1_ctx_base = h->cabac_state
5195         + coeff_abs_level_m1_offset[cat];
5196
5197     if( !is_dc && cat == 5 ) {
5198 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5199         for(last= 0; last < coefs; last++) { \
5200             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5201             if( get_cabac( CC, sig_ctx )) { \
5202                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5203                 index[coeff_count++] = last; \
5204                 if( get_cabac( CC, last_ctx ) ) { \
5205                     last= max_coeff; \
5206                     break; \
5207                 } \
5208             } \
5209         }\
5210         if( last == max_coeff -1 ) {\
5211             index[coeff_count++] = last;\
5212         }
5213         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5214 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5215         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5216     } else {
5217         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5218 #else
5219         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5220     } else {
5221         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5222 #endif
5223     }
5224     assert(coeff_count > 0);
5225
5226     if( is_dc ) {
5227         if( cat == 0 )
5228             h->cbp_table[h->mb_xy] |= 0x100;
5229         else
5230             h->cbp_table[h->mb_xy] |= 0x40 << n;
5231     } else {
5232         if( cat == 5 )
5233             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5234         else {
5235             assert( cat == 1 || cat == 2 || cat == 4 );
5236             h->non_zero_count_cache[scan8[n]] = coeff_count;
5237         }
5238     }
5239
5240     do {
5241         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5242
5243         int j= scantable[index[--coeff_count]];
5244
5245         if( get_cabac( CC, ctx ) == 0 ) {
5246             node_ctx = coeff_abs_level_transition[0][node_ctx];
5247             if( is_dc ) {
5248                 block[j] = get_cabac_bypass_sign( CC, -1);
5249             }else{
5250                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5251             }
5252         } else {
5253             int coeff_abs = 2;
5254             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5255             node_ctx = coeff_abs_level_transition[1][node_ctx];
5256
5257             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5258                 coeff_abs++;
5259             }
5260
5261             if( coeff_abs >= 15 ) {
5262                 int j = 0;
5263                 while( get_cabac_bypass( CC ) ) {
5264                     j++;
5265                 }
5266
5267                 coeff_abs=1;
5268                 while( j-- ) {
5269                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5270                 }
5271                 coeff_abs+= 14;
5272             }
5273
5274             if( is_dc ) {
5275                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5276             }else{
5277                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5278             }
5279         }
5280     } while( coeff_count );
5281 #ifdef CABAC_ON_STACK
5282             h->cabac.range     = cc.range     ;
5283             h->cabac.low       = cc.low       ;
5284             h->cabac.bytestream= cc.bytestream;
5285 #endif
5286
5287 }
5288
5289 #ifndef CONFIG_SMALL
5290 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5291     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5292 }
5293
5294 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5295     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5296 }
5297 #endif
5298
5299 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5300 #ifdef CONFIG_SMALL
5301     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5302 #else
5303     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5304     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5305 #endif
5306 }
5307
5308 static inline void compute_mb_neighbors(H264Context *h)
5309 {
5310     MpegEncContext * const s = &h->s;
5311     const int mb_xy  = h->mb_xy;
5312     h->top_mb_xy     = mb_xy - s->mb_stride;
5313     h->left_mb_xy[0] = mb_xy - 1;
5314     if(FRAME_MBAFF){
5315         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5316         const int top_pair_xy      = pair_xy     - s->mb_stride;
5317         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5318         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5319         const int curr_mb_frame_flag = !MB_FIELD;
5320         const int bottom = (s->mb_y & 1);
5321         if (bottom
5322                 ? !curr_mb_frame_flag // bottom macroblock
5323                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5324                 ) {
5325             h->top_mb_xy -= s->mb_stride;
5326         }
5327         if (left_mb_frame_flag != curr_mb_frame_flag) {
5328             h->left_mb_xy[0] = pair_xy - 1;
5329         }
5330     } else if (FIELD_PICTURE) {
5331         h->top_mb_xy -= s->mb_stride;
5332     }
5333     return;
5334 }
5335
5336 /**
5337  * decodes a macroblock
5338  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5339  */
5340 static int decode_mb_cabac(H264Context *h) {
5341     MpegEncContext * const s = &h->s;
5342     int mb_xy;
5343     int mb_type, partition_count, cbp = 0;
5344     int dct8x8_allowed= h->pps.transform_8x8_mode;
5345
5346     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5347
5348     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5349
5350     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5351     if( h->slice_type_nos != FF_I_TYPE ) {
5352         int skip;
5353         /* a skipped mb needs the aff flag from the following mb */
5354         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5355             predict_field_decoding_flag(h);
5356         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5357             skip = h->next_mb_skipped;
5358         else
5359             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5360         /* read skip flags */
5361         if( skip ) {
5362             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5363                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5364                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5365                 if(h->next_mb_skipped)
5366                     predict_field_decoding_flag(h);
5367                 else
5368                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5369             }
5370
5371             decode_mb_skip(h);
5372
5373             h->cbp_table[mb_xy] = 0;
5374             h->chroma_pred_mode_table[mb_xy] = 0;
5375             h->last_qscale_diff = 0;
5376
5377             return 0;
5378
5379         }
5380     }
5381     if(FRAME_MBAFF){
5382         if( (s->mb_y&1) == 0 )
5383             h->mb_mbaff =
5384             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5385     }
5386
5387     h->prev_mb_skipped = 0;
5388
5389     compute_mb_neighbors(h);
5390     mb_type = decode_cabac_mb_type( h );
5391     assert(mb_type >= 0);
5392
5393     if( h->slice_type_nos == FF_B_TYPE ) {
5394         if( mb_type < 23 ){
5395             partition_count= b_mb_type_info[mb_type].partition_count;
5396             mb_type=         b_mb_type_info[mb_type].type;
5397         }else{
5398             mb_type -= 23;
5399             goto decode_intra_mb;
5400         }
5401     } else if( h->slice_type_nos == FF_P_TYPE ) {
5402         if( mb_type < 5) {
5403             partition_count= p_mb_type_info[mb_type].partition_count;
5404             mb_type=         p_mb_type_info[mb_type].type;
5405         } else {
5406             mb_type -= 5;
5407             goto decode_intra_mb;
5408         }
5409     } else {
5410         if(h->slice_type == FF_SI_TYPE && mb_type)
5411             mb_type--;
5412         assert(h->slice_type_nos == FF_I_TYPE);
5413 decode_intra_mb:
5414         partition_count = 0;
5415         cbp= i_mb_type_info[mb_type].cbp;
5416         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5417         mb_type= i_mb_type_info[mb_type].type;
5418     }
5419     if(MB_FIELD)
5420         mb_type |= MB_TYPE_INTERLACED;
5421
5422     h->slice_table[ mb_xy ]= h->slice_num;
5423
5424     if(IS_INTRA_PCM(mb_type)) {
5425         const uint8_t *ptr;
5426
5427         // We assume these blocks are very rare so we do not optimize it.
5428         // FIXME The two following lines get the bitstream position in the cabac
5429         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5430         ptr= h->cabac.bytestream;
5431         if(h->cabac.low&0x1) ptr--;
5432         if(CABAC_BITS==16){
5433             if(h->cabac.low&0x1FF) ptr--;
5434         }
5435
5436         // The pixels are stored in the same order as levels in h->mb array.
5437         memcpy(h->mb, ptr, 256); ptr+=256;
5438         if(CHROMA){
5439             memcpy(h->mb+128, ptr, 128); ptr+=128;
5440         }
5441
5442         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5443
5444         // All blocks are present
5445         h->cbp_table[mb_xy] = 0x1ef;
5446         h->chroma_pred_mode_table[mb_xy] = 0;
5447         // In deblocking, the quantizer is 0
5448         s->current_picture.qscale_table[mb_xy]= 0;
5449         // All coeffs are present
5450         memset(h->non_zero_count[mb_xy], 16, 16);
5451         s->current_picture.mb_type[mb_xy]= mb_type;
5452         h->last_qscale_diff = 0;
5453         return 0;
5454     }
5455
5456     if(MB_MBAFF){
5457         h->ref_count[0] <<= 1;
5458         h->ref_count[1] <<= 1;
5459     }
5460
5461     fill_caches(h, mb_type, 0);
5462
5463     if( IS_INTRA( mb_type ) ) {
5464         int i, pred_mode;
5465         if( IS_INTRA4x4( mb_type ) ) {
5466             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5467                 mb_type |= MB_TYPE_8x8DCT;
5468                 for( i = 0; i < 16; i+=4 ) {
5469                     int pred = pred_intra_mode( h, i );
5470                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5471                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5472                 }
5473             } else {
5474                 for( i = 0; i < 16; i++ ) {
5475                     int pred = pred_intra_mode( h, i );
5476                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5477
5478                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5479                 }
5480             }
5481             write_back_intra_pred_mode(h);
5482             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5483         } else {
5484             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5485             if( h->intra16x16_pred_mode < 0 ) return -1;
5486         }
5487         if(CHROMA){
5488             h->chroma_pred_mode_table[mb_xy] =
5489             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5490
5491             pred_mode= check_intra_pred_mode( h, pred_mode );
5492             if( pred_mode < 0 ) return -1;
5493             h->chroma_pred_mode= pred_mode;
5494         }
5495     } else if( partition_count == 4 ) {
5496         int i, j, sub_partition_count[4], list, ref[2][4];
5497
5498         if( h->slice_type_nos == FF_B_TYPE ) {
5499             for( i = 0; i < 4; i++ ) {
5500                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5501                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5502                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5503             }
5504             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5505                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5506                 pred_direct_motion(h, &mb_type);
5507                 h->ref_cache[0][scan8[4]] =
5508                 h->ref_cache[1][scan8[4]] =
5509                 h->ref_cache[0][scan8[12]] =
5510                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5511                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5512                     for( i = 0; i < 4; i++ )
5513                         if( IS_DIRECT(h->sub_mb_type[i]) )
5514                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5515                 }
5516             }
5517         } else {
5518             for( i = 0; i < 4; i++ ) {
5519                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5520                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5521                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5522             }
5523         }
5524
5525         for( list = 0; list < h->list_count; list++ ) {
5526                 for( i = 0; i < 4; i++ ) {
5527                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5528                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5529                         if( h->ref_count[list] > 1 ){
5530                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5531                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5532                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5533                                 return -1;
5534                             }
5535                         }else
5536                             ref[list][i] = 0;
5537                     } else {
5538                         ref[list][i] = -1;
5539                     }
5540                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5541                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5542                 }
5543         }
5544
5545         if(dct8x8_allowed)
5546             dct8x8_allowed = get_dct8x8_allowed(h);
5547
5548         for(list=0; list<h->list_count; list++){
5549             for(i=0; i<4; i++){
5550                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5551                 if(IS_DIRECT(h->sub_mb_type[i])){
5552                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5553                     continue;
5554                 }
5555
5556                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5557                     const int sub_mb_type= h->sub_mb_type[i];
5558                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5559                     for(j=0; j<sub_partition_count[i]; j++){
5560                         int mpx, mpy;
5561                         int mx, my;
5562                         const int index= 4*i + block_width*j;
5563                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5564                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5565                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5566
5567                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5568                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5569                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5570
5571                         if(IS_SUB_8X8(sub_mb_type)){
5572                             mv_cache[ 1 ][0]=
5573                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5574                             mv_cache[ 1 ][1]=
5575                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5576
5577                             mvd_cache[ 1 ][0]=
5578                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5579                             mvd_cache[ 1 ][1]=
5580                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5581                         }else if(IS_SUB_8X4(sub_mb_type)){
5582                             mv_cache[ 1 ][0]= mx;
5583                             mv_cache[ 1 ][1]= my;
5584
5585                             mvd_cache[ 1 ][0]= mx - mpx;
5586                             mvd_cache[ 1 ][1]= my - mpy;
5587                         }else if(IS_SUB_4X8(sub_mb_type)){
5588                             mv_cache[ 8 ][0]= mx;
5589                             mv_cache[ 8 ][1]= my;
5590
5591                             mvd_cache[ 8 ][0]= mx - mpx;
5592                             mvd_cache[ 8 ][1]= my - mpy;
5593                         }
5594                         mv_cache[ 0 ][0]= mx;
5595                         mv_cache[ 0 ][1]= my;
5596
5597                         mvd_cache[ 0 ][0]= mx - mpx;
5598                         mvd_cache[ 0 ][1]= my - mpy;
5599                     }
5600                 }else{
5601                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5602                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5603                     p[0] = p[1] = p[8] = p[9] = 0;
5604                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5605                 }
5606             }
5607         }
5608     } else if( IS_DIRECT(mb_type) ) {
5609         pred_direct_motion(h, &mb_type);
5610         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5611         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5612         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5613     } else {
5614         int list, mx, my, i, mpx, mpy;
5615         if(IS_16X16(mb_type)){
5616             for(list=0; list<h->list_count; list++){
5617                 if(IS_DIR(mb_type, 0, list)){
5618                     int ref;
5619                     if(h->ref_count[list] > 1){
5620                         ref= decode_cabac_mb_ref(h, list, 0);
5621                         if(ref >= (unsigned)h->ref_count[list]){
5622                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5623                             return -1;
5624                         }
5625                     }else
5626                         ref=0;
5627                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5628                 }else
5629                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5630             }
5631             for(list=0; list<h->list_count; list++){
5632                 if(IS_DIR(mb_type, 0, list)){
5633                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5634
5635                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5636                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5637                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5638
5639                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5640                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5641                 }else
5642                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5643             }
5644         }
5645         else if(IS_16X8(mb_type)){
5646             for(list=0; list<h->list_count; list++){
5647                     for(i=0; i<2; i++){
5648                         if(IS_DIR(mb_type, i, list)){
5649                             int ref;
5650                             if(h->ref_count[list] > 1){
5651                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5652                                 if(ref >= (unsigned)h->ref_count[list]){
5653                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5654                                     return -1;
5655                                 }
5656                             }else
5657                                 ref=0;
5658                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5659                         }else
5660                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5661                     }
5662             }
5663             for(list=0; list<h->list_count; list++){
5664                 for(i=0; i<2; i++){
5665                     if(IS_DIR(mb_type, i, list)){
5666                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5667                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5668                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5669                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5670
5671                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5672                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5673                     }else{
5674                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5675                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5676                     }
5677                 }
5678             }
5679         }else{
5680             assert(IS_8X16(mb_type));
5681             for(list=0; list<h->list_count; list++){
5682                     for(i=0; i<2; i++){
5683                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5684                             int ref;
5685                             if(h->ref_count[list] > 1){
5686                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5687                                 if(ref >= (unsigned)h->ref_count[list]){
5688                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5689                                     return -1;
5690                                 }
5691                             }else
5692                                 ref=0;
5693                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5694                         }else
5695                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5696                     }
5697             }
5698             for(list=0; list<h->list_count; list++){
5699                 for(i=0; i<2; i++){
5700                     if(IS_DIR(mb_type, i, list)){
5701                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5702                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5703                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5704
5705                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5706                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5707                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5708                     }else{
5709                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5710                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5711                     }
5712                 }
5713             }
5714         }
5715     }
5716
5717    if( IS_INTER( mb_type ) ) {
5718         h->chroma_pred_mode_table[mb_xy] = 0;
5719         write_back_motion( h, mb_type );
5720    }
5721
5722     if( !IS_INTRA16x16( mb_type ) ) {
5723         cbp  = decode_cabac_mb_cbp_luma( h );
5724         if(CHROMA)
5725             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5726     }
5727
5728     h->cbp_table[mb_xy] = h->cbp = cbp;
5729
5730     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5731         if( decode_cabac_mb_transform_size( h ) )
5732             mb_type |= MB_TYPE_8x8DCT;
5733     }
5734     s->current_picture.mb_type[mb_xy]= mb_type;
5735
5736     if( cbp || IS_INTRA16x16( mb_type ) ) {
5737         const uint8_t *scan, *scan8x8, *dc_scan;
5738         const uint32_t *qmul;
5739         int dqp;
5740
5741         if(IS_INTERLACED(mb_type)){
5742             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5743             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5744             dc_scan= luma_dc_field_scan;
5745         }else{
5746             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5747             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5748             dc_scan= luma_dc_zigzag_scan;
5749         }
5750
5751         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5752         if( dqp == INT_MIN ){
5753             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5754             return -1;
5755         }
5756         s->qscale += dqp;
5757         if(((unsigned)s->qscale) > 51){
5758             if(s->qscale<0) s->qscale+= 52;
5759             else            s->qscale-= 52;
5760         }
5761         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5762         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5763
5764         if( IS_INTRA16x16( mb_type ) ) {
5765             int i;
5766             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5767             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5768
5769             if( cbp&15 ) {
5770                 qmul = h->dequant4_coeff[0][s->qscale];
5771                 for( i = 0; i < 16; i++ ) {
5772                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5773                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5774                 }
5775             } else {
5776                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5777             }
5778         } else {
5779             int i8x8, i4x4;
5780             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5781                 if( cbp & (1<<i8x8) ) {
5782                     if( IS_8x8DCT(mb_type) ) {
5783                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5784                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5785                     } else {
5786                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5787                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5788                             const int index = 4*i8x8 + i4x4;
5789                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5790 //START_TIMER
5791                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5792 //STOP_TIMER("decode_residual")
5793                         }
5794                     }
5795                 } else {
5796                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5797                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5798                 }
5799             }
5800         }
5801
5802         if( cbp&0x30 ){
5803             int c;
5804             for( c = 0; c < 2; c++ ) {
5805                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5806                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5807             }
5808         }
5809
5810         if( cbp&0x20 ) {
5811             int c, i;
5812             for( c = 0; c < 2; c++ ) {
5813                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5814                 for( i = 0; i < 4; i++ ) {
5815                     const int index = 16 + 4 * c + i;
5816                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5817                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5818                 }
5819             }
5820         } else {
5821             uint8_t * const nnz= &h->non_zero_count_cache[0];
5822             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5823             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5824         }
5825     } else {
5826         uint8_t * const nnz= &h->non_zero_count_cache[0];
5827         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5828         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5829         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5830         h->last_qscale_diff = 0;
5831     }
5832
5833     s->current_picture.qscale_table[mb_xy]= s->qscale;
5834     write_back_non_zero_count(h);
5835
5836     if(MB_MBAFF){
5837         h->ref_count[0] >>= 1;
5838         h->ref_count[1] >>= 1;
5839     }
5840
5841     return 0;
5842 }
5843
5844
5845 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5846     int i, d;
5847     const int index_a = qp + h->slice_alpha_c0_offset;
5848     const int alpha = (alpha_table+52)[index_a];
5849     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5850
5851     if( bS[0] < 4 ) {
5852         int8_t tc[4];
5853         for(i=0; i<4; i++)
5854             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5855         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5856     } else {
5857         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5858     }
5859 }
5860 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5861     int i;
5862     const int index_a = qp + h->slice_alpha_c0_offset;
5863     const int alpha = (alpha_table+52)[index_a];
5864     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5865
5866     if( bS[0] < 4 ) {
5867         int8_t tc[4];
5868         for(i=0; i<4; i++)
5869             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5870         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5871     } else {
5872         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5873     }
5874 }
5875
5876 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5877     int i;
5878     for( i = 0; i < 16; i++, pix += stride) {
5879         int index_a;
5880         int alpha;
5881         int beta;
5882
5883         int qp_index;
5884         int bS_index = (i >> 1);
5885         if (!MB_FIELD) {
5886             bS_index &= ~1;
5887             bS_index |= (i & 1);
5888         }
5889
5890         if( bS[bS_index] == 0 ) {
5891             continue;
5892         }
5893
5894         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5895         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5896         alpha = (alpha_table+52)[index_a];
5897         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5898
5899         if( bS[bS_index] < 4 ) {
5900             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5901             const int p0 = pix[-1];
5902             const int p1 = pix[-2];
5903             const int p2 = pix[-3];
5904             const int q0 = pix[0];
5905             const int q1 = pix[1];
5906             const int q2 = pix[2];
5907
5908             if( FFABS( p0 - q0 ) < alpha &&
5909                 FFABS( p1 - p0 ) < beta &&
5910                 FFABS( q1 - q0 ) < beta ) {
5911                 int tc = tc0;
5912                 int i_delta;
5913
5914                 if( FFABS( p2 - p0 ) < beta ) {
5915                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5916                     tc++;
5917                 }
5918                 if( FFABS( q2 - q0 ) < beta ) {
5919                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5920                     tc++;
5921                 }
5922
5923                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5924                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5925                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5926                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5927             }
5928         }else{
5929             const int p0 = pix[-1];
5930             const int p1 = pix[-2];
5931             const int p2 = pix[-3];
5932
5933             const int q0 = pix[0];
5934             const int q1 = pix[1];
5935             const int q2 = pix[2];
5936
5937             if( FFABS( p0 - q0 ) < alpha &&
5938                 FFABS( p1 - p0 ) < beta &&
5939                 FFABS( q1 - q0 ) < beta ) {
5940
5941                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5942                     if( FFABS( p2 - p0 ) < beta)
5943                     {
5944                         const int p3 = pix[-4];
5945                         /* p0', p1', p2' */
5946                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5947                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5948                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5949                     } else {
5950                         /* p0' */
5951                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5952                     }
5953                     if( FFABS( q2 - q0 ) < beta)
5954                     {
5955                         const int q3 = pix[3];
5956                         /* q0', q1', q2' */
5957                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5958                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5959                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5960                     } else {
5961                         /* q0' */
5962                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5963                     }
5964                 }else{
5965                     /* p0', q0' */
5966                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5967                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5968                 }
5969                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5970             }
5971         }
5972     }
5973 }
5974 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5975     int i;
5976     for( i = 0; i < 8; i++, pix += stride) {
5977         int index_a;
5978         int alpha;
5979         int beta;
5980
5981         int qp_index;
5982         int bS_index = i;
5983
5984         if( bS[bS_index] == 0 ) {
5985             continue;
5986         }
5987
5988         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5989         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5990         alpha = (alpha_table+52)[index_a];
5991         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5992
5993         if( bS[bS_index] < 4 ) {
5994             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
5995             const int p0 = pix[-1];
5996             const int p1 = pix[-2];
5997             const int q0 = pix[0];
5998             const int q1 = pix[1];
5999
6000             if( FFABS( p0 - q0 ) < alpha &&
6001                 FFABS( p1 - p0 ) < beta &&
6002                 FFABS( q1 - q0 ) < beta ) {
6003                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6004
6005                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6006                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6007                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6008             }
6009         }else{
6010             const int p0 = pix[-1];
6011             const int p1 = pix[-2];
6012             const int q0 = pix[0];
6013             const int q1 = pix[1];
6014
6015             if( FFABS( p0 - q0 ) < alpha &&
6016                 FFABS( p1 - p0 ) < beta &&
6017                 FFABS( q1 - q0 ) < beta ) {
6018
6019                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6020                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6021                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6022             }
6023         }
6024     }
6025 }
6026
6027 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6028     int i, d;
6029     const int index_a = qp + h->slice_alpha_c0_offset;
6030     const int alpha = (alpha_table+52)[index_a];
6031     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6032     const int pix_next  = stride;
6033
6034     if( bS[0] < 4 ) {
6035         int8_t tc[4];
6036         for(i=0; i<4; i++)
6037             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6038         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6039     } else {
6040         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6041     }
6042 }
6043
6044 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6045     int i;
6046     const int index_a = qp + h->slice_alpha_c0_offset;
6047     const int alpha = (alpha_table+52)[index_a];
6048     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6049
6050     if( bS[0] < 4 ) {
6051         int8_t tc[4];
6052         for(i=0; i<4; i++)
6053             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6054         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6055     } else {
6056         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6057     }
6058 }
6059
6060 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6061     MpegEncContext * const s = &h->s;
6062     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6063     int mb_xy, mb_type;
6064     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6065
6066     mb_xy = h->mb_xy;
6067
6068     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6069         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6070        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6071                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6072         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6073         return;
6074     }
6075     assert(!FRAME_MBAFF);
6076
6077     mb_type = s->current_picture.mb_type[mb_xy];
6078     qp = s->current_picture.qscale_table[mb_xy];
6079     qp0 = s->current_picture.qscale_table[mb_xy-1];
6080     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6081     qpc = get_chroma_qp( h, 0, qp );
6082     qpc0 = get_chroma_qp( h, 0, qp0 );
6083     qpc1 = get_chroma_qp( h, 0, qp1 );
6084     qp0 = (qp + qp0 + 1) >> 1;
6085     qp1 = (qp + qp1 + 1) >> 1;
6086     qpc0 = (qpc + qpc0 + 1) >> 1;
6087     qpc1 = (qpc + qpc1 + 1) >> 1;
6088     qp_thresh = 15 - h->slice_alpha_c0_offset;
6089     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6090        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6091         return;
6092
6093     if( IS_INTRA(mb_type) ) {
6094         int16_t bS4[4] = {4,4,4,4};
6095         int16_t bS3[4] = {3,3,3,3};
6096         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6097         if( IS_8x8DCT(mb_type) ) {
6098             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6099             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6100             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6101             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6102         } else {
6103             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6104             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6105             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6106             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6107             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6108             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6109             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6110             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6111         }
6112         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6113         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6114         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6115         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6116         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6117         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6118         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6119         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6120         return;
6121     } else {
6122         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6123         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6124         int edges;
6125         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6126             edges = 4;
6127             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6128         } else {
6129             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6130                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6131             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6132                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6133                              ? 3 : 0;
6134             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6135             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6136             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6137                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6138         }
6139         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6140             bSv[0][0] = 0x0004000400040004ULL;
6141         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6142             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6143
6144 #define FILTER(hv,dir,edge)\
6145         if(bSv[dir][edge]) {\
6146             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6147             if(!(edge&1)) {\
6148                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6149                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6150             }\
6151         }
6152         if( edges == 1 ) {
6153             FILTER(v,0,0);
6154             FILTER(h,1,0);
6155         } else if( IS_8x8DCT(mb_type) ) {
6156             FILTER(v,0,0);
6157             FILTER(v,0,2);
6158             FILTER(h,1,0);
6159             FILTER(h,1,2);
6160         } else {
6161             FILTER(v,0,0);
6162             FILTER(v,0,1);
6163             FILTER(v,0,2);
6164             FILTER(v,0,3);
6165             FILTER(h,1,0);
6166             FILTER(h,1,1);
6167             FILTER(h,1,2);
6168             FILTER(h,1,3);
6169         }
6170 #undef FILTER
6171     }
6172 }
6173
6174
6175 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6176     MpegEncContext * const s = &h->s;
6177     int edge;
6178     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6179     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6180     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6181     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6182     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6183
6184     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6185                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6186     // how often to recheck mv-based bS when iterating between edges
6187     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6188                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6189     // how often to recheck mv-based bS when iterating along each edge
6190     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6191
6192     if (first_vertical_edge_done) {
6193         start = 1;
6194     }
6195
6196     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6197         start = 1;
6198
6199     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6200         && !IS_INTERLACED(mb_type)
6201         && IS_INTERLACED(mbm_type)
6202         ) {
6203         // This is a special case in the norm where the filtering must
6204         // be done twice (one each of the field) even if we are in a
6205         // frame macroblock.
6206         //
6207         static const int nnz_idx[4] = {4,5,6,3};
6208         unsigned int tmp_linesize   = 2 *   linesize;
6209         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6210         int mbn_xy = mb_xy - 2 * s->mb_stride;
6211         int qp;
6212         int i, j;
6213         int16_t bS[4];
6214
6215         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6216             if( IS_INTRA(mb_type) ||
6217                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6218                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6219             } else {
6220                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6221                 for( i = 0; i < 4; i++ ) {
6222                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6223                         mbn_nnz[nnz_idx[i]] != 0 )
6224                         bS[i] = 2;
6225                     else
6226                         bS[i] = 1;
6227                 }
6228             }
6229             // Do not use s->qscale as luma quantizer because it has not the same
6230             // value in IPCM macroblocks.
6231             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6232             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6233             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6234             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6235             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6236                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6237             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6238                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6239         }
6240
6241         start = 1;
6242     }
6243
6244     /* Calculate bS */
6245     for( edge = start; edge < edges; edge++ ) {
6246         /* mbn_xy: neighbor macroblock */
6247         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6248         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6249         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6250         int16_t bS[4];
6251         int qp;
6252
6253         if( (edge&1) && IS_8x8DCT(mb_type) )
6254             continue;
6255
6256         if( IS_INTRA(mb_type) ||
6257             IS_INTRA(mbn_type) ) {
6258             int value;
6259             if (edge == 0) {
6260                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6261                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6262                 ) {
6263                     value = 4;
6264                 } else {
6265                     value = 3;
6266                 }
6267             } else {
6268                 value = 3;
6269             }
6270             bS[0] = bS[1] = bS[2] = bS[3] = value;
6271         } else {
6272             int i, l;
6273             int mv_done;
6274
6275             if( edge & mask_edge ) {
6276                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6277                 mv_done = 1;
6278             }
6279             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6280                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6281                 mv_done = 1;
6282             }
6283             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6284                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6285                 int bn_idx= b_idx - (dir ? 8:1);
6286                 int v = 0;
6287
6288                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6289                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6290                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6291                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6292                 }
6293
6294                 if(h->slice_type_nos == FF_B_TYPE && v){
6295                     v=0;
6296                     for( l = 0; !v && l < 2; l++ ) {
6297                         int ln= 1-l;
6298                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6299                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6300                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6301                     }
6302                 }
6303
6304                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6305                 mv_done = 1;
6306             }
6307             else
6308                 mv_done = 0;
6309
6310             for( i = 0; i < 4; i++ ) {
6311                 int x = dir == 0 ? edge : i;
6312                 int y = dir == 0 ? i    : edge;
6313                 int b_idx= 8 + 4 + x + 8*y;
6314                 int bn_idx= b_idx - (dir ? 8:1);
6315
6316                 if( h->non_zero_count_cache[b_idx] |
6317                     h->non_zero_count_cache[bn_idx] ) {
6318                     bS[i] = 2;
6319                 }
6320                 else if(!mv_done)
6321                 {
6322                     bS[i] = 0;
6323                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6324                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6325                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6326                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6327                             bS[i] = 1;
6328                             break;
6329                         }
6330                     }
6331
6332                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6333                         bS[i] = 0;
6334                         for( l = 0; l < 2; l++ ) {
6335                             int ln= 1-l;
6336                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6337                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6338                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6339                                 bS[i] = 1;
6340                                 break;
6341                             }
6342                         }
6343                     }
6344                 }
6345             }
6346
6347             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6348                 continue;
6349         }
6350
6351         /* Filter edge */
6352         // Do not use s->qscale as luma quantizer because it has not the same
6353         // value in IPCM macroblocks.
6354         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6355         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6356         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6357         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6358         if( dir == 0 ) {
6359             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6360             if( (edge&1) == 0 ) {
6361                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6362                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6363                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6364                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6365             }
6366         } else {
6367             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6368             if( (edge&1) == 0 ) {
6369                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6370                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6371                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6372                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6373             }
6374         }
6375     }
6376 }
6377
/**
 * Run the deblocking filter for one macroblock.
 *
 * Steps, in order:
 *  - for non-MBAFF frames, return early when the QP of this macroblock and
 *    its rounded averages with the left / top neighbours are all at or below
 *    a conservative threshold;
 *  - for CAVLC streams with transform_8x8_mode set, overwrite entries of
 *    h->non_zero_count_cache with bits taken from the coded block patterns;
 *  - for MBAFF frames whose left neighbour has a different interlaced type,
 *    filter the first vertical edge here with 8 bS values and 2 QPs;
 *  - call filter_mb_dir() for direction 0 and then for direction 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma pointer handed to the edge filters
 * @param img_cb     Cb pointer handed to the edge filters
 * @param img_cr     Cr pointer handed to the edge filters
 * @param linesize   stride passed with img_y
 * @param uvlinesize stride passed with img_cb / img_cr
 */
6378 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6379     MpegEncContext * const s = &h->s;
6380     const int mb_xy= mb_x + mb_y*s->mb_stride;
6381     const int mb_type = s->current_picture.mb_type[mb_xy];
         // forwarded unchanged to filter_mb_dir(): 2 for interlaced macroblocks, 4 otherwise
6382     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6383     int first_vertical_edge_done = 0;
6384     int dir;
6385
6386     //for sufficiently low qp, filtering wouldn't do anything
6387     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6388     if(!FRAME_MBAFF){
6389         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6390         int qp = s->current_picture.qscale_table[mb_xy];
             // the left / top neighbour is only consulted when mb_x / mb_y is non-zero
6391         if(qp <= qp_thresh
6392            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6393            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6394             return;
6395         }
6396     }
6397
6398     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6399     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6400         int top_type, left_type[2];
6401         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6402         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6403         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6404
             // top neighbour: cache row 0, columns 4..7, from cbp bits 4 and 8
6405         if(IS_8x8DCT(top_type)){
6406             h->non_zero_count_cache[4+8*0]=
6407             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6408             h->non_zero_count_cache[6+8*0]=
6409             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6410         }
             // left neighbours: cache column 3, rows 1..4, from cbp bits 2 and 8
6411         if(IS_8x8DCT(left_type[0])){
6412             h->non_zero_count_cache[3+8*1]=
6413             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6414         }
6415         if(IS_8x8DCT(left_type[1])){
6416             h->non_zero_count_cache[3+8*3]=
6417             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6418         }
6419
             // current macroblock: each group of four scan8 entries gets one bit of h->cbp
6420         if(IS_8x8DCT(mb_type)){
6421             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6422             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6423
6424             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6425             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6426
6427             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6428             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6429
6430             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6431             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6432         }
6433     }
6434
6435     if (FRAME_MBAFF
6436             // left mb is in picture
6437             && h->slice_table[mb_xy-1] != 0xFFFF
6438             // and current and left pair do not have the same interlaced type
6439             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6440             // and left mb is in the same slice if deblocking_filter == 2
6441             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6442         /* First vertical edge is different in MBAFF frames
6443          * There are 8 different bS to compute and 2 different Qp
6444          */
             // pair_xy addresses the even row of the pair; left_mb_xy[] holds the two
             // macroblocks directly to its left (even row, then the row below)
6445         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6446         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6447         int16_t bS[8];
6448         int qp[2];
6449         int bqp[2];
6450         int rqp[2];
6451         int mb_qp, mbn0_qp, mbn1_qp;
6452         int i;
6453         first_vertical_edge_done = 1;
6454
             // bS is 4 if either side is intra, 2 if either side has a non-zero
             // count / cbp bit for the adjoining block, and 1 otherwise
6455         if( IS_INTRA(mb_type) )
6456             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6457         else {
6458             for( i = 0; i < 8; i++ ) {
6459                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6460
6461                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6462                     bS[i] = 4;
6463                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6464                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6465                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6466                                                                        :
6467                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6468                     bS[i] = 2;
6469                 else
6470                     bS[i] = 1;
6471             }
6472         }
6473
             // rounded averages of the current QP with each left neighbour's QP:
             // qp[] for luma, bqp[] for chroma index 0 (Cb), rqp[] for chroma index 1 (Cr)
6474         mb_qp = s->current_picture.qscale_table[mb_xy];
6475         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6476         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6477         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6478         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6479                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6480         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6481                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6482         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6483         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6484                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6485         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6486                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6487
6488         /* Filter edge */
6489         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6490         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6491         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6492         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6493         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6494     }
6495
         // both variants make the same two calls: first_vertical_edge_done is only
         // forwarded for dir 0, dir 1 always receives 0
6496 #ifdef CONFIG_SMALL
6497     for( dir = 0; dir < 2; dir++ )
6498         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6499 #else
6500     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6501     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6502 #endif
6503 }
6504
/**
 * Decode the macroblocks of one slice.
 *
 * Picks the CABAC or the CAVLC macroblock parser according to h->pps.cabac
 * and loops until the slice ends or an error occurs. Every successfully
 * parsed macroblock is handed to hl_decode_mb(); in MBAFF frames two
 * vertically adjacent macroblocks are processed per iteration. The covered
 * macroblock range and its status are reported through ff_er_add_slice()
 * before every return inside the loops.
 *
 * @param avctx codec context (not referenced by this function)
 * @param arg   pointer to an H264Context pointer
 * @return 0 when the slice ended cleanly, -1 on error
 */
6505 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6506     H264Context *h = *(void**)arg;
6507     MpegEncContext * const s = &h->s;
         // status bits reported to ff_er_add_slice(): only the AC bits for
         // partitioned frames, all of 0x7F otherwise
6508     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6509
6510     s->mb_skip_run= -1;
6511
6512     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6513                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6514
6515     if( h->pps.cabac ) {
6516         int i;
6517
6518         /* realign */
6519         align_get_bits( &s->gb );
6520
6521         /* init cabac */
             // the CABAC decoder starts at the current byte position and gets the
             // remaining size rounded up to whole bytes
6522         ff_init_cabac_states( &h->cabac);
6523         ff_init_cabac_decoder( &h->cabac,
6524                                s->gb.buffer + get_bits_count(&s->gb)/8,
6525                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6526         /* calculate pre-state */
6527         for( i= 0; i < 460; i++ ) {
6528             int pre;
6529             if( h->slice_type_nos == FF_I_TYPE )
6530                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6531             else
6532                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6533
                 // fold pre (1..126) into the stored state; the lowest bit is set when pre > 63
6534             if( pre <= 63 )
6535                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6536             else
6537                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6538         }
6539
6540         for(;;){
6541 //START_TIMER
6542             int ret = decode_mb_cabac(h);
6543             int eos;
6544 //STOP_TIMER("decode_mb_cabac")
6545
6546             if(ret>=0) hl_decode_mb(h);
6547
                 // second macroblock of an MBAFF pair: decoded at mb_y+1, then mb_y is restored
6548             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6549                 s->mb_y++;
6550
6551                 if(ret>=0) ret = decode_mb_cabac(h);
6552
6553                 if(ret>=0) hl_decode_mb(h);
6554                 s->mb_y--;
6555             }
6556             eos = get_cabac_terminate( &h->cabac );
6557
                 // fail on a parse error or when the CABAC reader is more than 2 bytes past the end
6558             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6559                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6560                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6561                 return -1;
6562             }
6563
                 // end of row: emit the finished band, then advance one row
                 // (two rows for field / MBAFF pictures)
6564             if( ++s->mb_x >= s->mb_width ) {
6565                 s->mb_x = 0;
6566                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6567                 ++s->mb_y;
6568                 if(FIELD_OR_MBAFF_PICTURE) {
6569                     ++s->mb_y;
6570                 }
6571             }
6572
6573             if( eos || s->mb_y >= s->mb_height ) {
6574                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6575                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6576                 return 0;
6577             }
6578         }
6579
6580     } else {
6581         for(;;){
6582             int ret = decode_mb_cavlc(h);
6583
6584             if(ret>=0) hl_decode_mb(h);
6585
                 // second macroblock of an MBAFF pair: decoded at mb_y+1, then mb_y is restored
6586             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6587                 s->mb_y++;
6588                 ret = decode_mb_cavlc(h);
6589
6590                 if(ret>=0) hl_decode_mb(h);
6591                 s->mb_y--;
6592             }
6593
6594             if(ret<0){
6595                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6596                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6597
6598                 return -1;
6599             }
6600
6601             if(++s->mb_x >= s->mb_width){
6602                 s->mb_x=0;
6603                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6604                 ++s->mb_y;
6605                 if(FIELD_OR_MBAFF_PICTURE) {
6606                     ++s->mb_y;
6607                 }
                     // past the last row: success only if the bit reader is exactly at the end of the data
6608                 if(s->mb_y >= s->mb_height){
6609                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6610
6611                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6612                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6613
6614                         return 0;
6615                     }else{
6616                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6617
6618                         return -1;
6619                     }
6620                 }
6621             }
6622
                 // data exhausted with no pending skip run: clean end if exactly at
                 // the end of the data, error if the reader went past it
6623             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6624                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6625                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6626                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6627
6628                     return 0;
6629                 }else{
6630                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6631
6632                     return -1;
6633                 }
6634             }
6635         }
6636     }
6637
     /* NOTE(review): the block below is disabled and never compiled; it calls
      * decode_mb() and get_bits_count(s->gb), unlike the live loops above, and
      * contains stray '?' characters, so it would not build if re-enabled. */
6638 #if 0
6639     for(;s->mb_y < s->mb_height; s->mb_y++){
6640         for(;s->mb_x < s->mb_width; s->mb_x++){
6641             int ret= decode_mb(h);
6642
6643             hl_decode_mb(h);
6644
6645             if(ret<0){
6646                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6647                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6648
6649                 return -1;
6650             }
6651
6652             if(++s->mb_x >= s->mb_width){
6653                 s->mb_x=0;
6654                 if(++s->mb_y >= s->mb_height){
6655                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6656                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6657
6658                         return 0;
6659                     }else{
6660                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6661
6662                         return -1;
6663                     }
6664                 }
6665             }
6666
6667             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6668                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6669                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6670
6671                     return 0;
6672                 }else{
6673                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6674
6675                     return -1;
6676                 }
6677             }
6678         }
6679         s->mb_x=0;
6680         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6681     }
6682 #endif
         // both loops above only leave through their own return statements
6683     return -1; //not reached
6684 }
6685
6686 static int decode_picture_timing(H264Context *h){
6687     MpegEncContext * const s = &h->s;
6688     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6689         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6690         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6691     }
6692     if(h->sps.pic_struct_present_flag){
6693         unsigned int i, num_clock_ts;
6694         h->sei_pic_struct = get_bits(&s->gb, 4);
6695
6696         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6697             return -1;
6698
6699         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6700
6701         for (i = 0 ; i < num_clock_ts ; i++){
6702             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6703                 unsigned int full_timestamp_flag;
6704                 skip_bits(&s->gb, 2);                 /* ct_type */
6705                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6706                 skip_bits(&s->gb, 5);                 /* counting_type */
6707                 full_timestamp_flag = get_bits(&s->gb, 1);
6708                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6709                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6710                 skip_bits(&s->gb, 8);                 /* n_frames */
6711                 if(full_timestamp_flag){
6712                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6713                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6714                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6715                 }else{
6716                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6717                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6718                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6719                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6720                             if(get_bits(&s->gb, 1))   /* hours_flag */
6721                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6722                         }
6723                     }
6724                 }
6725                 if(h->sps.time_offset_length > 0)
6726                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6727             }
6728         }
6729     }
6730     return 0;
6731 }
6732
6733 static int decode_unregistered_user_data(H264Context *h, int size){
6734     MpegEncContext * const s = &h->s;
6735     uint8_t user_data[16+256];
6736     int e, build, i;
6737
6738     if(size<16)
6739         return -1;
6740
6741     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6742         user_data[i]= get_bits(&s->gb, 8);
6743     }
6744
6745     user_data[i]= 0;
6746     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6747     if(e==1 && build>=0)
6748         h->x264_build= build;
6749
6750     if(s->avctx->debug & FF_DEBUG_BUGS)
6751         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6752
6753     for(; i<size; i++)
6754         skip_bits(&s->gb, 8);
6755
6756     return 0;
6757 }
6758
6759 static int decode_sei(H264Context *h){
6760     MpegEncContext * const s = &h->s;
6761
6762     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6763         int size, type;
6764
6765         type=0;
6766         do{
6767             type+= show_bits(&s->gb, 8);
6768         }while(get_bits(&s->gb, 8) == 255);
6769
6770         size=0;
6771         do{
6772             size+= show_bits(&s->gb, 8);
6773         }while(get_bits(&s->gb, 8) == 255);
6774
6775         switch(type){
6776         case 1: // Picture timing SEI
6777             if(decode_picture_timing(h) < 0)
6778                 return -1;
6779             break;
6780         case 5:
6781             if(decode_unregistered_user_data(h, size) < 0)
6782                 return -1;
6783             break;
6784         default:
6785             skip_bits(&s->gb, 8*size);
6786         }
6787
6788         //FIXME check bits here
6789         align_get_bits(&s->gb);
6790     }
6791
6792     return 0;
6793 }
6794
6795 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6796     MpegEncContext * const s = &h->s;
6797     int cpb_count, i;
6798     cpb_count = get_ue_golomb(&s->gb) + 1;
6799
6800     if(cpb_count > 32U){
6801         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6802         return -1;
6803     }
6804
6805     get_bits(&s->gb, 4); /* bit_rate_scale */
6806     get_bits(&s->gb, 4); /* cpb_size_scale */
6807     for(i=0; i<cpb_count; i++){
6808         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6809         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6810         get_bits1(&s->gb);     /* cbr_flag */
6811     }
6812     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6813     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6814     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6815     sps->time_offset_length = get_bits(&s->gb, 5);
6816     return 0;
6817 }
6818
6819 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6820     MpegEncContext * const s = &h->s;
6821     int aspect_ratio_info_present_flag;
6822     unsigned int aspect_ratio_idc;
6823
6824     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6825
6826     if( aspect_ratio_info_present_flag ) {
6827         aspect_ratio_idc= get_bits(&s->gb, 8);
6828         if( aspect_ratio_idc == EXTENDED_SAR ) {
6829             sps->sar.num= get_bits(&s->gb, 16);
6830             sps->sar.den= get_bits(&s->gb, 16);
6831         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6832             sps->sar=  pixel_aspect[aspect_ratio_idc];
6833         }else{
6834             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6835             return -1;
6836         }
6837     }else{
6838         sps->sar.num=
6839         sps->sar.den= 0;
6840     }
6841 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6842
6843     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6844         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6845     }
6846
6847     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6848         get_bits(&s->gb, 3);    /* video_format */
6849         get_bits1(&s->gb);      /* video_full_range_flag */
6850         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6851             get_bits(&s->gb, 8); /* colour_primaries */
6852             get_bits(&s->gb, 8); /* transfer_characteristics */
6853             get_bits(&s->gb, 8); /* matrix_coefficients */
6854         }
6855     }
6856
6857     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6858         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6859         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6860     }
6861
6862     sps->timing_info_present_flag = get_bits1(&s->gb);
6863     if(sps->timing_info_present_flag){
6864         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6865         sps->time_scale = get_bits_long(&s->gb, 32);
6866         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6867     }
6868
6869     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6870     if(sps->nal_hrd_parameters_present_flag)
6871         if(decode_hrd_parameters(h, sps) < 0)
6872             return -1;
6873     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6874     if(sps->vcl_hrd_parameters_present_flag)
6875         if(decode_hrd_parameters(h, sps) < 0)
6876             return -1;
6877     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6878         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6879     sps->pic_struct_present_flag = get_bits1(&s->gb);
6880
6881     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6882     if(sps->bitstream_restriction_flag){
6883         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6884         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6885         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6886         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6887         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6888         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6889         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6890
6891         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6892             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6893             return -1;
6894         }
6895     }
6896
6897     return 0;
6898 }
6899
6900 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6901                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6902     MpegEncContext * const s = &h->s;
6903     int i, last = 8, next = 8;
6904     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6905     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6906         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6907     else
6908     for(i=0;i<size;i++){
6909         if(next)
6910             next = (last + get_se_golomb(&s->gb)) & 0xff;
6911         if(!i && !next){ /* matrix not written, we use the preset one */
6912             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6913             break;
6914         }
6915         last = factors[scan[i]] = next ? next : last;
6916     }
6917 }
6918
6919 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6920                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6921     MpegEncContext * const s = &h->s;
6922     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6923     const uint8_t *fallback[4] = {
6924         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6925         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6926         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6927         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6928     };
6929     if(get_bits1(&s->gb)){
6930         sps->scaling_matrix_present |= is_sps;
6931         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6932         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6933         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6934         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6935         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6936         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6937         if(is_sps || pps->transform_8x8_mode){
6938             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6939             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6940         }
6941     }
6942 }
6943
6944 static inline int decode_seq_parameter_set(H264Context *h){
6945     MpegEncContext * const s = &h->s;
6946     int profile_idc, level_idc;
6947     unsigned int sps_id;
6948     int i;
6949     SPS *sps;
6950
6951     profile_idc= get_bits(&s->gb, 8);
6952     get_bits1(&s->gb);   //constraint_set0_flag
6953     get_bits1(&s->gb);   //constraint_set1_flag
6954     get_bits1(&s->gb);   //constraint_set2_flag
6955     get_bits1(&s->gb);   //constraint_set3_flag
6956     get_bits(&s->gb, 4); // reserved
6957     level_idc= get_bits(&s->gb, 8);
6958     sps_id= get_ue_golomb(&s->gb);
6959
6960     if(sps_id >= MAX_SPS_COUNT) {
6961         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
6962         return -1;
6963     }
6964     sps= av_mallocz(sizeof(SPS));
6965     if(sps == NULL)
6966         return -1;
6967
6968     sps->profile_idc= profile_idc;
6969     sps->level_idc= level_idc;
6970
6971     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
6972     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
6973     sps->scaling_matrix_present = 0;
6974
6975     if(sps->profile_idc >= 100){ //high profile
6976         sps->chroma_format_idc= get_ue_golomb(&s->gb);
6977         if(sps->chroma_format_idc == 3)
6978             get_bits1(&s->gb);  //residual_color_transform_flag
6979         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6980         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6981         sps->transform_bypass = get_bits1(&s->gb);
6982         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
6983     }else{
6984         sps->chroma_format_idc= 1;
6985     }
6986
6987     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6988     sps->poc_type= get_ue_golomb(&s->gb);
6989
6990     if(sps->poc_type == 0){ //FIXME #define
6991         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6992     } else if(sps->poc_type == 1){//FIXME #define
6993         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6994         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6995         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6996         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
6997
6998         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
6999             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7000             goto fail;
7001         }
7002
7003         for(i=0; i<sps->poc_cycle_length; i++)
7004             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7005     }else if(sps->poc_type != 2){
7006         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7007         goto fail;
7008     }
7009
7010     sps->ref_frame_count= get_ue_golomb(&s->gb);
7011     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7012         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7013         goto fail;
7014     }
7015     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7016     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7017     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7018     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7019        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7020         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7021         goto fail;
7022     }
7023
7024     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7025     if(!sps->frame_mbs_only_flag)
7026         sps->mb_aff= get_bits1(&s->gb);
7027     else
7028         sps->mb_aff= 0;
7029
7030     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7031
7032 #ifndef ALLOW_INTERLACE
7033     if(sps->mb_aff)
7034         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7035 #endif
7036     sps->crop= get_bits1(&s->gb);
7037     if(sps->crop){
7038         sps->crop_left  = get_ue_golomb(&s->gb);
7039         sps->crop_right = get_ue_golomb(&s->gb);
7040         sps->crop_top   = get_ue_golomb(&s->gb);
7041         sps->crop_bottom= get_ue_golomb(&s->gb);
7042         if(sps->crop_left || sps->crop_top){
7043             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7044         }
7045         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7046             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7047         }
7048     }else{
7049         sps->crop_left  =
7050         sps->crop_right =
7051         sps->crop_top   =
7052         sps->crop_bottom= 0;
7053     }
7054
7055     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7056     if( sps->vui_parameters_present_flag )
7057         decode_vui_parameters(h, sps);
7058
7059     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7060         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7061                sps_id, sps->profile_idc, sps->level_idc,
7062                sps->poc_type,
7063                sps->ref_frame_count,
7064                sps->mb_width, sps->mb_height,
7065                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7066                sps->direct_8x8_inference_flag ? "8B8" : "",
7067                sps->crop_left, sps->crop_right,
7068                sps->crop_top, sps->crop_bottom,
7069                sps->vui_parameters_present_flag ? "VUI" : "",
7070                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7071                );
7072     }
7073     av_free(h->sps_buffers[sps_id]);
7074     h->sps_buffers[sps_id]= sps;
7075     return 0;
7076 fail:
7077     av_free(sps);
7078     return -1;
7079 }
7080
7081 static void
7082 build_qp_table(PPS *pps, int t, int index)
7083 {
7084     int i;
7085     for(i = 0; i < 52; i++)
7086         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7087 }
7088
7089 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7090     MpegEncContext * const s = &h->s;
7091     unsigned int pps_id= get_ue_golomb(&s->gb);
7092     PPS *pps;
7093
7094     if(pps_id >= MAX_PPS_COUNT) {
7095         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7096         return -1;
7097     }
7098
7099     pps= av_mallocz(sizeof(PPS));
7100     if(pps == NULL)
7101         return -1;
7102     pps->sps_id= get_ue_golomb(&s->gb);
7103     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7104         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7105         goto fail;
7106     }
7107
7108     pps->cabac= get_bits1(&s->gb);
7109     pps->pic_order_present= get_bits1(&s->gb);
7110     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7111     if(pps->slice_group_count > 1 ){
7112         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7113         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7114         switch(pps->mb_slice_group_map_type){
7115         case 0:
7116 #if 0
7117 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7118 |    run_length[ i ]                                |1  |ue(v)   |
7119 #endif
7120             break;
7121         case 2:
7122 #if 0
7123 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7124 |{                                                  |   |        |
7125 |    top_left_mb[ i ]                               |1  |ue(v)   |
7126 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7127 |   }                                               |   |        |
7128 #endif
7129             break;
7130         case 3:
7131         case 4:
7132         case 5:
7133 #if 0
7134 |   slice_group_change_direction_flag               |1  |u(1)    |
7135 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7136 #endif
7137             break;
7138         case 6:
7139 #if 0
7140 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7141 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7142 |)                                                  |   |        |
7143 |    slice_group_id[ i ]                            |1  |u(v)    |
7144 #endif
7145             break;
7146         }
7147     }
7148     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7149     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7150     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7151         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7152         goto fail;
7153     }
7154
7155     pps->weighted_pred= get_bits1(&s->gb);
7156     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7157     pps->init_qp= get_se_golomb(&s->gb) + 26;
7158     pps->init_qs= get_se_golomb(&s->gb) + 26;
7159     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7160     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7161     pps->constrained_intra_pred= get_bits1(&s->gb);
7162     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7163
7164     pps->transform_8x8_mode= 0;
7165     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7166     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7167     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7168
7169     if(get_bits_count(&s->gb) < bit_length){
7170         pps->transform_8x8_mode= get_bits1(&s->gb);
7171         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7172         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7173     } else {
7174         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7175     }
7176
7177     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7178     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7179     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7180         h->pps.chroma_qp_diff= 1;
7181
7182     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7183         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7184                pps_id, pps->sps_id,
7185                pps->cabac ? "CABAC" : "CAVLC",
7186                pps->slice_group_count,
7187                pps->ref_count[0], pps->ref_count[1],
7188                pps->weighted_pred ? "weighted" : "",
7189                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7190                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7191                pps->constrained_intra_pred ? "CONSTR" : "",
7192                pps->redundant_pic_cnt_present ? "REDU" : "",
7193                pps->transform_8x8_mode ? "8x8DCT" : ""
7194                );
7195     }
7196
7197     av_free(h->pps_buffers[pps_id]);
7198     h->pps_buffers[pps_id]= pps;
7199     return 0;
7200 fail:
7201     av_free(pps);
7202     return -1;
7203 }
7204
7205 /**
7206  * Call decode_slice() for each context.
7207  *
7208  * @param h h264 master context
7209  * @param context_count number of contexts to execute
7210  */
7211 static void execute_decode_slices(H264Context *h, int context_count){
7212     MpegEncContext * const s = &h->s;
7213     AVCodecContext * const avctx= s->avctx;
7214     H264Context *hx;
7215     int i;
7216
7217     if(context_count == 1) {
7218         decode_slice(avctx, &h);
7219     } else {
7220         for(i = 1; i < context_count; i++) {
7221             hx = h->thread_context[i];
7222             hx->s.error_recognition = avctx->error_recognition;
7223             hx->s.error_count = 0;
7224         }
7225
7226         avctx->execute(avctx, (void *)decode_slice,
7227                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7228
7229         /* pull back stuff from slices to master context */
7230         hx = h->thread_context[context_count - 1];
7231         s->mb_x = hx->s.mb_x;
7232         s->mb_y = hx->s.mb_y;
7233         s->dropable = hx->s.dropable;
7234         s->picture_structure = hx->s.picture_structure;
7235         for(i = 1; i < context_count; i++)
7236             h->s.error_count += h->thread_context[i]->s.error_count;
7237     }
7238 }
7239
7240
7241 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7242     MpegEncContext * const s = &h->s;
7243     AVCodecContext * const avctx= s->avctx;
7244     int buf_index=0;
7245     H264Context *hx; ///< thread context
7246     int context_count = 0;
7247
7248     h->max_contexts = avctx->thread_count;
7249 #if 0
7250     int i;
7251     for(i=0; i<50; i++){
7252         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7253     }
7254 #endif
7255     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7256         h->current_slice = 0;
7257         if (!s->first_field)
7258             s->current_picture_ptr= NULL;
7259     }
7260
7261     for(;;){
7262         int consumed;
7263         int dst_length;
7264         int bit_length;
7265         const uint8_t *ptr;
7266         int i, nalsize = 0;
7267         int err;
7268
7269         if(h->is_avc) {
7270             if(buf_index >= buf_size) break;
7271             nalsize = 0;
7272             for(i = 0; i < h->nal_length_size; i++)
7273                 nalsize = (nalsize << 8) | buf[buf_index++];
7274             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7275                 if(nalsize == 1){
7276                     buf_index++;
7277                     continue;
7278                 }else{
7279                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7280                     break;
7281                 }
7282             }
7283         } else {
7284             // start code prefix search
7285             for(; buf_index + 3 < buf_size; buf_index++){
7286                 // This should always succeed in the first iteration.
7287                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7288                     break;
7289             }
7290
7291             if(buf_index+3 >= buf_size) break;
7292
7293             buf_index+=3;
7294         }
7295
7296         hx = h->thread_context[context_count];
7297
7298         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7299         if (ptr==NULL || dst_length < 0){
7300             return -1;
7301         }
7302         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7303             dst_length--;
7304         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7305
7306         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7307             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7308         }
7309
7310         if (h->is_avc && (nalsize != consumed)){
7311             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7312             consumed= nalsize;
7313         }
7314
7315         buf_index += consumed;
7316
7317         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7318            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7319             continue;
7320
7321       again:
7322         err = 0;
7323         switch(hx->nal_unit_type){
7324         case NAL_IDR_SLICE:
7325             if (h->nal_unit_type != NAL_IDR_SLICE) {
7326                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7327                 return -1;
7328             }
7329             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7330         case NAL_SLICE:
7331             init_get_bits(&hx->s.gb, ptr, bit_length);
7332             hx->intra_gb_ptr=
7333             hx->inter_gb_ptr= &hx->s.gb;
7334             hx->s.data_partitioning = 0;
7335
7336             if((err = decode_slice_header(hx, h)))
7337                break;
7338
7339             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7340             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7341                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7342                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7343                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7344                && avctx->skip_frame < AVDISCARD_ALL)
7345                 context_count++;
7346             break;
7347         case NAL_DPA:
7348             init_get_bits(&hx->s.gb, ptr, bit_length);
7349             hx->intra_gb_ptr=
7350             hx->inter_gb_ptr= NULL;
7351             hx->s.data_partitioning = 1;
7352
7353             err = decode_slice_header(hx, h);
7354             break;
7355         case NAL_DPB:
7356             init_get_bits(&hx->intra_gb, ptr, bit_length);
7357             hx->intra_gb_ptr= &hx->intra_gb;
7358             break;
7359         case NAL_DPC:
7360             init_get_bits(&hx->inter_gb, ptr, bit_length);
7361             hx->inter_gb_ptr= &hx->inter_gb;
7362
7363             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7364                && s->context_initialized
7365                && s->hurry_up < 5
7366                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7367                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7368                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7369                && avctx->skip_frame < AVDISCARD_ALL)
7370                 context_count++;
7371             break;
7372         case NAL_SEI:
7373             init_get_bits(&s->gb, ptr, bit_length);
7374             decode_sei(h);
7375             break;
7376         case NAL_SPS:
7377             init_get_bits(&s->gb, ptr, bit_length);
7378             decode_seq_parameter_set(h);
7379
7380             if(s->flags& CODEC_FLAG_LOW_DELAY)
7381                 s->low_delay=1;
7382
7383             if(avctx->has_b_frames < 2)
7384                 avctx->has_b_frames= !s->low_delay;
7385             break;
7386         case NAL_PPS:
7387             init_get_bits(&s->gb, ptr, bit_length);
7388
7389             decode_picture_parameter_set(h, bit_length);
7390
7391             break;
7392         case NAL_AUD:
7393         case NAL_END_SEQUENCE:
7394         case NAL_END_STREAM:
7395         case NAL_FILLER_DATA:
7396         case NAL_SPS_EXT:
7397         case NAL_AUXILIARY_SLICE:
7398             break;
7399         default:
7400             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7401         }
7402
7403         if(context_count == h->max_contexts) {
7404             execute_decode_slices(h, context_count);
7405             context_count = 0;
7406         }
7407
7408         if (err < 0)
7409             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7410         else if(err == 1) {
7411             /* Slice could not be decoded in parallel mode, copy down
7412              * NAL unit stuff to context 0 and restart. Note that
7413              * rbsp_buffer is not transferred, but since we no longer
7414              * run in parallel mode this should not be an issue. */
7415             h->nal_unit_type = hx->nal_unit_type;
7416             h->nal_ref_idc   = hx->nal_ref_idc;
7417             hx = h;
7418             goto again;
7419         }
7420     }
7421     if(context_count)
7422         execute_decode_slices(h, context_count);
7423     return buf_index;
7424 }
7425
7426 /**
7427  * returns the number of bytes consumed for building the current frame
7428  */
7429 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7430         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7431         if(pos+10>buf_size) pos=buf_size; // oops ;)
7432
7433         return pos;
7434 }
7435
7436 static int decode_frame(AVCodecContext *avctx,
7437                              void *data, int *data_size,
7438                              const uint8_t *buf, int buf_size)
7439 {
7440     H264Context *h = avctx->priv_data;
7441     MpegEncContext *s = &h->s;
7442     AVFrame *pict = data;
7443     int buf_index;
7444
7445     s->flags= avctx->flags;
7446     s->flags2= avctx->flags2;
7447
7448    /* end of stream, output what is still in the buffers */
7449     if (buf_size == 0) {
7450         Picture *out;
7451         int i, out_idx;
7452
7453 //FIXME factorize this with the output code below
7454         out = h->delayed_pic[0];
7455         out_idx = 0;
7456         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7457             if(h->delayed_pic[i]->poc < out->poc){
7458                 out = h->delayed_pic[i];
7459                 out_idx = i;
7460             }
7461
7462         for(i=out_idx; h->delayed_pic[i]; i++)
7463             h->delayed_pic[i] = h->delayed_pic[i+1];
7464
7465         if(out){
7466             *data_size = sizeof(AVFrame);
7467             *pict= *(AVFrame*)out;
7468         }
7469
7470         return 0;
7471     }
7472
7473     if(h->is_avc && !h->got_avcC) {
7474         int i, cnt, nalsize;
7475         unsigned char *p = avctx->extradata;
7476         if(avctx->extradata_size < 7) {
7477             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7478             return -1;
7479         }
7480         if(*p != 1) {
7481             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7482             return -1;
7483         }
7484         /* sps and pps in the avcC always have length coded with 2 bytes,
7485            so put a fake nal_length_size = 2 while parsing them */
7486         h->nal_length_size = 2;
7487         // Decode sps from avcC
7488         cnt = *(p+5) & 0x1f; // Number of sps
7489         p += 6;
7490         for (i = 0; i < cnt; i++) {
7491             nalsize = AV_RB16(p) + 2;
7492             if(decode_nal_units(h, p, nalsize) < 0) {
7493                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7494                 return -1;
7495             }
7496             p += nalsize;
7497         }
7498         // Decode pps from avcC
7499         cnt = *(p++); // Number of pps
7500         for (i = 0; i < cnt; i++) {
7501             nalsize = AV_RB16(p) + 2;
7502             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7503                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7504                 return -1;
7505             }
7506             p += nalsize;
7507         }
7508         // Now store right nal length size, that will be use to parse all other nals
7509         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7510         // Do not reparse avcC
7511         h->got_avcC = 1;
7512     }
7513
7514     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7515         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7516             return -1;
7517         h->got_avcC = 1;
7518     }
7519
7520     buf_index=decode_nal_units(h, buf, buf_size);
7521     if(buf_index < 0)
7522         return -1;
7523
7524     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7525         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7526         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7527         return -1;
7528     }
7529
7530     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7531         Picture *out = s->current_picture_ptr;
7532         Picture *cur = s->current_picture_ptr;
7533         int i, pics, cross_idr, out_of_order, out_idx;
7534
7535         s->mb_y= 0;
7536
7537         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7538         s->current_picture_ptr->pict_type= s->pict_type;
7539
7540         if(!s->dropable) {
7541             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7542             h->prev_poc_msb= h->poc_msb;
7543             h->prev_poc_lsb= h->poc_lsb;
7544         }
7545         h->prev_frame_num_offset= h->frame_num_offset;
7546         h->prev_frame_num= h->frame_num;
7547
7548         /*
7549          * FIXME: Error handling code does not seem to support interlaced
7550          * when slices span multiple rows
7551          * The ff_er_add_slice calls don't work right for bottom
7552          * fields; they cause massive erroneous error concealing
7553          * Error marking covers both fields (top and bottom).
7554          * This causes a mismatched s->error_count
7555          * and a bad error table. Further, the error count goes to
7556          * INT_MAX when called for bottom field, because mb_y is
7557          * past end by one (callers fault) and resync_mb_y != 0
7558          * causes problems for the first MB line, too.
7559          */
7560         if (!FIELD_PICTURE)
7561             ff_er_frame_end(s);
7562
7563         MPV_frame_end(s);
7564
7565         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7566             /* Wait for second field. */
7567             *data_size = 0;
7568
7569         } else {
7570             cur->repeat_pict = 0;
7571
7572             /* Signal interlacing information externally. */
7573             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7574             if(h->sps.pic_struct_present_flag){
7575                 switch (h->sei_pic_struct)
7576                 {
7577                 case SEI_PIC_STRUCT_FRAME:
7578                     cur->interlaced_frame = 0;
7579                     break;
7580                 case SEI_PIC_STRUCT_TOP_FIELD:
7581                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7582                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7583                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7584                     cur->interlaced_frame = 1;
7585                     break;
7586                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7587                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7588                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7589                     // From these hints, let the applications decide if they apply deinterlacing.
7590                     cur->repeat_pict = 1;
7591                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7592                     break;
7593                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7594                     // Force progressive here, as doubling interlaced frame is a bad idea.
7595                     cur->interlaced_frame = 0;
7596                     cur->repeat_pict = 2;
7597                     break;
7598                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7599                     cur->interlaced_frame = 0;
7600                     cur->repeat_pict = 4;
7601                     break;
7602                 }
7603             }else{
7604                 /* Derive interlacing flag from used decoding process. */
7605                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7606             }
7607
7608             if (cur->field_poc[0] != cur->field_poc[1]){
7609                 /* Derive top_field_first from field pocs. */
7610                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7611             }else{
7612                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7613                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7614                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7615                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7616                         cur->top_field_first = 1;
7617                     else
7618                         cur->top_field_first = 0;
7619                 }else{
7620                     /* Most likely progressive */
7621                     cur->top_field_first = 0;
7622                 }
7623             }
7624
7625         //FIXME do something with unavailable reference frames
7626
7627             /* Sort B-frames into display order */
7628
7629             if(h->sps.bitstream_restriction_flag
7630                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7631                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7632                 s->low_delay = 0;
7633             }
7634
7635             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7636                && !h->sps.bitstream_restriction_flag){
7637                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7638                 s->low_delay= 0;
7639             }
7640
7641             pics = 0;
7642             while(h->delayed_pic[pics]) pics++;
7643
7644             assert(pics <= MAX_DELAYED_PIC_COUNT);
7645
7646             h->delayed_pic[pics++] = cur;
7647             if(cur->reference == 0)
7648                 cur->reference = DELAYED_PIC_REF;
7649
7650             out = h->delayed_pic[0];
7651             out_idx = 0;
7652             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7653                 if(h->delayed_pic[i]->poc < out->poc){
7654                     out = h->delayed_pic[i];
7655                     out_idx = i;
7656                 }
7657             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7658
7659             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7660
7661             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7662                 { }
7663             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7664                || (s->low_delay &&
7665                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7666                  || cur->pict_type == FF_B_TYPE)))
7667             {
7668                 s->low_delay = 0;
7669                 s->avctx->has_b_frames++;
7670             }
7671
7672             if(out_of_order || pics > s->avctx->has_b_frames){
7673                 out->reference &= ~DELAYED_PIC_REF;
7674                 for(i=out_idx; h->delayed_pic[i]; i++)
7675                     h->delayed_pic[i] = h->delayed_pic[i+1];
7676             }
7677             if(!out_of_order && pics > s->avctx->has_b_frames){
7678                 *data_size = sizeof(AVFrame);
7679
7680                 h->outputed_poc = out->poc;
7681                 *pict= *(AVFrame*)out;
7682             }else{
7683                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7684             }
7685         }
7686     }
7687
7688     assert(pict->data[0] || !*data_size);
7689     ff_print_debug_info(s, pict);
7690 //printf("out %d\n", (int)pict->data[0]);
7691 #if 0 //?
7692
7693     /* Return the Picture timestamp as the frame number */
7694     /* we subtract 1 because it is added on utils.c     */
7695     avctx->frame_number = s->picture_number - 1;
7696 #endif
7697     return get_consumed_bytes(s, buf_index, buf_size);
7698 }
#if 0
/**
 * Fill h->mb_avail[] for the current macroblock (disabled code).
 *
 * Entries 0..2 describe the macroblocks in the row above (left of, directly
 * above, right of the current column) and entry 3 the macroblock to the
 * left; each is set when that position exists and its slice_table entry
 * equals h->slice_num. Entries 4 and 5 are constant.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7718
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * buffer, reads them back and reports every value that does not round
 * trip. The DCT, quantizer and NAL layer tests are disabled with #if 0.
 *
 * @return 0 always; mismatches are only printed
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    /* dsputil_init() is handed this context, so give it defined contents. */
    memset(&avctx, 0, sizeof(avctx));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value written was i - COUNT/2, so that is what to report */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7893
7894
7895 static av_cold int decode_end(AVCodecContext *avctx)
7896 {
7897     H264Context *h = avctx->priv_data;
7898     MpegEncContext *s = &h->s;
7899     int i;
7900
7901     av_freep(&h->rbsp_buffer[0]);
7902     av_freep(&h->rbsp_buffer[1]);
7903     free_tables(h); //FIXME cleanup init stuff perhaps
7904
7905     for(i = 0; i < MAX_SPS_COUNT; i++)
7906         av_freep(h->sps_buffers + i);
7907
7908     for(i = 0; i < MAX_PPS_COUNT; i++)
7909         av_freep(h->pps_buffers + i);
7910
7911     MPV_common_end(s);
7912
7913 //    memset(h, 0, sizeof(H264Context));
7914
7915     return 0;
7916 }
7917
7918
7919 AVCodec h264_decoder = {
7920     "h264",
7921     CODEC_TYPE_VIDEO,
7922     CODEC_ID_H264,
7923     sizeof(H264Context),
7924     decode_init,
7925     NULL,
7926     decode_end,
7927     decode_frame,
7928     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7929     .flush= flush_dpb,
7930     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7931 };
7932
7933 #include "svq3.c"