libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
/**
 * Value of Picture.reference for a Picture that is not a reference picture
 * but is still being held for delayed output.
 */
#define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values where changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1434
1435     *dst_length= di;
1436     *consumed= si + 1;//+1 for the header
1437 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1438     return dst;
1439 }
1440
1441 /**
1442  * identifies the exact end of the bitstream
1443  * @return the length of the trailing, or 0 if damaged
1444  */
1445 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1446     int v= *src;
1447     int r;
1448
1449     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1450
1451     for(r=1; r<9; r++){
1452         if(v&1) return r;
1453         v>>=1;
1454     }
1455     return 0;
1456 }
1457
1458 /**
1459  * IDCT transforms the 16 dc values and dequantizes them.
1460  * @param qp quantization parameter
1461  */
1462 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1463 #define stride 16
1464     int i;
1465     int temp[16]; //FIXME check if this is a good idea
1466     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1467     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1468
1469 //memset(block, 64, 2*256);
1470 //return;
1471     for(i=0; i<4; i++){
1472         const int offset= y_offset[i];
1473         const int z0= block[offset+stride*0] + block[offset+stride*4];
1474         const int z1= block[offset+stride*0] - block[offset+stride*4];
1475         const int z2= block[offset+stride*1] - block[offset+stride*5];
1476         const int z3= block[offset+stride*1] + block[offset+stride*5];
1477
1478         temp[4*i+0]= z0+z3;
1479         temp[4*i+1]= z1+z2;
1480         temp[4*i+2]= z1-z2;
1481         temp[4*i+3]= z0-z3;
1482     }
1483
1484     for(i=0; i<4; i++){
1485         const int offset= x_offset[i];
1486         const int z0= temp[4*0+i] + temp[4*2+i];
1487         const int z1= temp[4*0+i] - temp[4*2+i];
1488         const int z2= temp[4*1+i] - temp[4*3+i];
1489         const int z3= temp[4*1+i] + temp[4*3+i];
1490
1491         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1492         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1493         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1494         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1495     }
1496 }
1497
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code (compiled out by the surrounding "#if 0"). It uses the
 * "stride" macro defined inside h264_luma_dc_dequant_idct_c().
 * The results are halved (>>1) and written back in place; no quantization is
 * applied, the qp parameter and the qmul lookup are commented out.
 * FIXME decide whether a quantization parameter is needed here.
 * @param block coefficient buffer, modified in place
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first butterfly pass, results go to temp[] */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second butterfly pass, halved results written back to block */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1537
1538 #undef xStride
1539 #undef stride
1540
1541 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1542     const int stride= 16*2;
1543     const int xStride= 16;
1544     int a,b,c,d,e;
1545
1546     a= block[stride*0 + xStride*0];
1547     b= block[stride*0 + xStride*1];
1548     c= block[stride*1 + xStride*0];
1549     d= block[stride*1 + xStride*1];
1550
1551     e= a-b;
1552     a= a+b;
1553     b= c-d;
1554     c= c+d;
1555
1556     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1557     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1558     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1559     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1560 }
1561
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values in place.
 * Disabled code (compiled out by the surrounding "#if 0"); same butterfly as
 * chroma_dc_dequant_idct_c() but without the qmul scaling.
 * @param block coefficient buffer, modified in place
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* horizontal butterflies; a and b are reused for the results */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    /* vertical butterflies */
    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1584
1585 /**
1586  * gets the chroma qp.
1587  */
1588 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1589     return h->pps.chroma_qp_table[t][qscale];
1590 }
1591
/**
 * Motion compensates one partition from one reference picture.
 * The luma block is predicted with qpix_op (selected by the quarter-sample
 * fraction of the vector), then both chroma blocks with chroma_op (using the
 * eighth-sample fraction). If the referenced area reaches outside the picture
 * plus the available edge, the source is first copied through
 * ff_emulated_edge_mc() into s->edge_emu_buffer.
 * @param pic reference picture to predict from
 * @param n block index into scan8[] selecting the motion vector
 * @param square if zero, the luma function is applied a second time at
 *               offset delta
 * @param chroma_height height passed to chroma_op
 * @param delta offset of the second luma call for non-square partitions
 * @param list reference list whose mv_cache entry is used
 * @param src_x_offset horizontal block position, multiplied by 8 and added
 *                     to the motion vector
 * @param src_y_offset vertical block position, multiplied by 8 and added
 *                     to the motion vector
 * @param qpix_op table of luma functions indexed by (mx&3) + ((my&3)<<2)
 * @param chroma_op chroma function
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    const int luma_xy= (mx&3) + ((my&3)<<2);
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    /* NOTE(review): presumably reserves room for the interpolation taps when
     * the vector has a fractional part -- confirm */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    /* the referenced luma area leaves the picture plus usable edge */
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    /* chroma is skipped entirely when decoding gray only */
    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    /* from here on my may differ from the value used for luma */
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    /* edge_emu_buffer is reused: cb must be consumed before cr is emulated */
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1649
1650 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1651                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1652                            int x_offset, int y_offset,
1653                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1654                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1655                            int list0, int list1){
1656     MpegEncContext * const s = &h->s;
1657     qpel_mc_func *qpix_op=  qpix_put;
1658     h264_chroma_mc_func chroma_op= chroma_put;
1659
1660     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1661     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1662     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1663     x_offset += 8*s->mb_x;
1664     y_offset += 8*(s->mb_y >> MB_FIELD);
1665
1666     if(list0){
1667         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1668         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1669                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1670                            qpix_op, chroma_op);
1671
1672         qpix_op=  qpix_avg;
1673         chroma_op= chroma_avg;
1674     }
1675
1676     if(list1){
1677         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681     }
1682 }
1683
1684 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1685                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1686                            int x_offset, int y_offset,
1687                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1688                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1689                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1690                            int list0, int list1){
1691     MpegEncContext * const s = &h->s;
1692
1693     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1694     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1695     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1696     x_offset += 8*s->mb_x;
1697     y_offset += 8*(s->mb_y >> MB_FIELD);
1698
1699     if(list0 && list1){
1700         /* don't optimize for luma-only case, since B-frames usually
1701          * use implicit weights => chroma too. */
1702         uint8_t *tmp_cb = s->obmc_scratchpad;
1703         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1704         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1705         int refn0 = h->ref_cache[0][ scan8[n] ];
1706         int refn1 = h->ref_cache[1][ scan8[n] ];
1707
1708         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1709                     dest_y, dest_cb, dest_cr,
1710                     x_offset, y_offset, qpix_put, chroma_put);
1711         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1712                     tmp_y, tmp_cb, tmp_cr,
1713                     x_offset, y_offset, qpix_put, chroma_put);
1714
1715         if(h->use_weight == 2){
1716             int weight0 = h->implicit_weight[refn0][refn1];
1717             int weight1 = 64 - weight0;
1718             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1719             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1720             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721         }else{
1722             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1723                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1724                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1725             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1726                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1727                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1729                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1730                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1731         }
1732     }else{
1733         int list = list1 ? 1 : 0;
1734         int refn = h->ref_cache[list][ scan8[n] ];
1735         Picture *ref= &h->ref_list[list][refn];
1736         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1737                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1738                     qpix_put, chroma_put);
1739
1740         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1742         if(h->use_weight_chroma){
1743             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1745             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1746                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1747         }
1748     }
1749 }
1750
1751 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1757                            int list0, int list1){
1758     if((h->use_weight==2 && list0 && list1
1759         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1760        || h->use_weight==1)
1761         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1762                          x_offset, y_offset, qpix_put, chroma_put,
1763                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1764     else
1765         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1766                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1767 }
1768
1769 static inline void prefetch_motion(H264Context *h, int list){
1770     /* fetch pixels for estimated mv 4 macroblocks ahead
1771      * optimized for 64byte cache lines */
1772     MpegEncContext * const s = &h->s;
1773     const int refn = h->ref_cache[list][scan8[0]];
1774     if(refn >= 0){
1775         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1776         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1777         uint8_t **src= h->ref_list[list][refn].data;
1778         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1779         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1780         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1781         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1782     }
1783 }
1784
/**
 * Performs inter prediction for the current macroblock.
 * Dispatches on the macroblock partitioning (16x16, 16x8, 8x16 or 8x8 with
 * sub partitions) and calls mc_part() once per partition. List 0 data is
 * prefetched before and list 1 data after the prediction.
 * @param qpix_put luma put function tables; the first index selects the
 *                 block size class (0, 1 or 2 are used here)
 * @param chroma_put chroma put functions, indexed by block size class
 * @param qpix_avg luma avg function tables, indexed like qpix_put
 * @param chroma_avg chroma avg functions, indexed like chroma_put
 * @param weight_op weight functions; mc_part() is given a pointer to the
 *                  entry of the partition shape (0..6)
 * @param weight_avg biweight functions, indexed like weight_op
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one partition covering the whole macroblock */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* upper and lower half; the 2nd luma call is 8 pixels to the right */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* left and right half; the 2nd luma call is 8 lines further down */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 sub macroblocks, each with its own sub partitioning */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside the 8x8 sub macroblock */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1871
1872 static av_cold void decode_init_vlc(void){
1873     static int done = 0;
1874
1875     if (!done) {
1876         int i;
1877         int offset;
1878         done = 1;
1879
1880         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1881         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1882         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1883                  &chroma_dc_coeff_token_len [0], 1, 1,
1884                  &chroma_dc_coeff_token_bits[0], 1, 1,
1885                  INIT_VLC_USE_NEW_STATIC);
1886
1887         offset = 0;
1888         for(i=0; i<4; i++){
1889             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1890             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1891             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1892                      &coeff_token_len [i][0], 1, 1,
1893                      &coeff_token_bits[i][0], 1, 1,
1894                      INIT_VLC_USE_NEW_STATIC);
1895             offset += coeff_token_vlc_tables_size[i];
1896         }
1897         /*
1898          * This is a one time safety check to make sure that
1899          * the packed static coeff_token_vlc table sizes
1900          * were initialized correctly.
1901          */
1902         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1903
1904         for(i=0; i<3; i++){
1905             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1906             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1907             init_vlc(&chroma_dc_total_zeros_vlc[i],
1908                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1909                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1910                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1911                      INIT_VLC_USE_NEW_STATIC);
1912         }
1913         for(i=0; i<15; i++){
1914             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1915             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1916             init_vlc(&total_zeros_vlc[i],
1917                      TOTAL_ZEROS_VLC_BITS, 16,
1918                      &total_zeros_len [i][0], 1, 1,
1919                      &total_zeros_bits[i][0], 1, 1,
1920                      INIT_VLC_USE_NEW_STATIC);
1921         }
1922
1923         for(i=0; i<6; i++){
1924             run_vlc[i].table = run_vlc_tables[i];
1925             run_vlc[i].table_allocated = run_vlc_tables_size;
1926             init_vlc(&run_vlc[i],
1927                      RUN_VLC_BITS, 7,
1928                      &run_len [i][0], 1, 1,
1929                      &run_bits[i][0], 1, 1,
1930                      INIT_VLC_USE_NEW_STATIC);
1931         }
1932         run7_vlc.table = run7_vlc_table,
1933         run7_vlc.table_allocated = run7_vlc_table_size;
1934         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1935                  &run_len [6][0], 1, 1,
1936                  &run_bits[6][0], 1, 1,
1937                  INIT_VLC_USE_NEW_STATIC);
1938     }
1939 }
1940
1941 static void free_tables(H264Context *h){
1942     int i;
1943     H264Context *hx;
1944     av_freep(&h->intra4x4_pred_mode);
1945     av_freep(&h->chroma_pred_mode_table);
1946     av_freep(&h->cbp_table);
1947     av_freep(&h->mvd_table[0]);
1948     av_freep(&h->mvd_table[1]);
1949     av_freep(&h->direct_table);
1950     av_freep(&h->non_zero_count);
1951     av_freep(&h->slice_table_base);
1952     h->slice_table= NULL;
1953
1954     av_freep(&h->mb2b_xy);
1955     av_freep(&h->mb2b8_xy);
1956
1957     for(i = 0; i < h->s.avctx->thread_count; i++) {
1958         hx = h->thread_context[i];
1959         if(!hx) continue;
1960         av_freep(&hx->top_borders[1]);
1961         av_freep(&hx->top_borders[0]);
1962         av_freep(&hx->s.obmc_scratchpad);
1963     }
1964 }
1965
1966 static void init_dequant8_coeff_table(H264Context *h){
1967     int i,q,x;
1968     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1969     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1970     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1971
1972     for(i=0; i<2; i++ ){
1973         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1974             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1975             break;
1976         }
1977
1978         for(q=0; q<52; q++){
1979             int shift = div6[q];
1980             int idx = rem6[q];
1981             for(x=0; x<64; x++)
1982                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1983                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1984                     h->pps.scaling_matrix8[i][x]) << shift;
1985         }
1986     }
1987 }
1988
1989 static void init_dequant4_coeff_table(H264Context *h){
1990     int i,j,q,x;
1991     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1992     for(i=0; i<6; i++ ){
1993         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1994         for(j=0; j<i; j++){
1995             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1996                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1997                 break;
1998             }
1999         }
2000         if(j<i)
2001             continue;
2002
2003         for(q=0; q<52; q++){
2004             int shift = div6[q] + 2;
2005             int idx = rem6[q];
2006             for(x=0; x<16; x++)
2007                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2008                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2009                     h->pps.scaling_matrix4[i][x]) << shift;
2010         }
2011     }
2012 }
2013
2014 static void init_dequant_tables(H264Context *h){
2015     int i,x;
2016     init_dequant4_coeff_table(h);
2017     if(h->pps.transform_8x8_mode)
2018         init_dequant8_coeff_table(h);
2019     if(h->sps.transform_bypass){
2020         for(i=0; i<6; i++)
2021             for(x=0; x<16; x++)
2022                 h->dequant4_coeff[i][0][x] = 1<<6;
2023         if(h->pps.transform_8x8_mode)
2024             for(i=0; i<2; i++)
2025                 for(x=0; x<64; x++)
2026                     h->dequant8_coeff[i][0][x] = 1<<6;
2027     }
2028 }
2029
2030
2031 /**
2032  * allocates tables.
2033  * needs width/height
2034  */
2035 static int alloc_tables(H264Context *h){
2036     MpegEncContext * const s = &h->s;
2037     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2038     int x,y;
2039
2040     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2041
2042     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2043     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2044     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2045
2046     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2047     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2048     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2049     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2050
2051     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2052     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2053
2054     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2055     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2056     for(y=0; y<s->mb_height; y++){
2057         for(x=0; x<s->mb_width; x++){
2058             const int mb_xy= x + y*s->mb_stride;
2059             const int b_xy = 4*x + 4*y*h->b_stride;
2060             const int b8_xy= 2*x + 2*y*h->b8_stride;
2061
2062             h->mb2b_xy [mb_xy]= b_xy;
2063             h->mb2b8_xy[mb_xy]= b8_xy;
2064         }
2065     }
2066
2067     s->obmc_scratchpad = NULL;
2068
2069     if(!h->dequant4_coeff[0])
2070         init_dequant_tables(h);
2071
2072     return 0;
2073 fail:
2074     free_tables(h);
2075     return -1;
2076 }
2077
2078 /**
2079  * Mimic alloc_tables(), but for every context thread.
2080  */
2081 static void clone_tables(H264Context *dst, H264Context *src){
2082     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2083     dst->non_zero_count           = src->non_zero_count;
2084     dst->slice_table              = src->slice_table;
2085     dst->cbp_table                = src->cbp_table;
2086     dst->mb2b_xy                  = src->mb2b_xy;
2087     dst->mb2b8_xy                 = src->mb2b8_xy;
2088     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2089     dst->mvd_table[0]             = src->mvd_table[0];
2090     dst->mvd_table[1]             = src->mvd_table[1];
2091     dst->direct_table             = src->direct_table;
2092
2093     dst->s.obmc_scratchpad = NULL;
2094     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2095 }
2096
2097 /**
2098  * Init context
2099  * Allocate buffers which are not shared amongst multiple threads.
2100  */
2101 static int context_init(H264Context *h){
2102     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2104
2105     return 0;
2106 fail:
2107     return -1; // free_tables will clean up for us
2108 }
2109
2110 static av_cold void common_init(H264Context *h){
2111     MpegEncContext * const s = &h->s;
2112
2113     s->width = s->avctx->width;
2114     s->height = s->avctx->height;
2115     s->codec_id= s->avctx->codec->id;
2116
2117     ff_h264_pred_init(&h->hpc, s->codec_id);
2118
2119     h->dequant_coeff_pps= -1;
2120     s->unrestricted_mv=1;
2121     s->decode=1; //FIXME
2122
2123     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2124
2125     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2126     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2127 }
2128
2129 static av_cold int decode_init(AVCodecContext *avctx){
2130     H264Context *h= avctx->priv_data;
2131     MpegEncContext * const s = &h->s;
2132
2133     MPV_decode_defaults(s);
2134
2135     s->avctx = avctx;
2136     common_init(h);
2137
2138     s->out_format = FMT_H264;
2139     s->workaround_bugs= avctx->workaround_bugs;
2140
2141     // set defaults
2142 //    s->decode_mb= ff_h263_decode_mb;
2143     s->quarter_sample = 1;
2144     s->low_delay= 1;
2145
2146     if(avctx->codec_id == CODEC_ID_SVQ3)
2147         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2148     else
2149         avctx->pix_fmt= PIX_FMT_YUV420P;
2150
2151     decode_init_vlc();
2152
2153     if(avctx->extradata_size > 0 && avctx->extradata &&
2154        *(char *)avctx->extradata == 1){
2155         h->is_avc = 1;
2156         h->got_avcC = 0;
2157     } else {
2158         h->is_avc = 0;
2159     }
2160
2161     h->thread_context[0] = h;
2162     h->outputed_poc = INT_MIN;
2163     h->prev_poc_msb= 1<<16;
2164     return 0;
2165 }
2166
2167 static int frame_start(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169     int i;
2170
2171     if(MPV_frame_start(s, s->avctx) < 0)
2172         return -1;
2173     ff_er_frame_start(s);
2174     /*
2175      * MPV_frame_start uses pict_type to derive key_frame.
2176      * This is incorrect for H.264; IDR markings must be used.
2177      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2178      * See decode_nal_units().
2179      */
2180     s->current_picture_ptr->key_frame= 0;
2181
2182     assert(s->linesize && s->uvlinesize);
2183
2184     for(i=0; i<16; i++){
2185         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2186         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2187     }
2188     for(i=0; i<4; i++){
2189         h->block_offset[16+i]=
2190         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2191         h->block_offset[24+16+i]=
2192         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2193     }
2194
2195     /* can't be in alloc_tables because linesize isn't known there.
2196      * FIXME: redo bipred weight to not require extra buffer? */
2197     for(i = 0; i < s->avctx->thread_count; i++)
2198         if(!h->thread_context[i]->s.obmc_scratchpad)
2199             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2200
2201     /* some macroblocks will be accessed before they're available */
2202     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2203         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2204
2205 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2206
2207     // We mark the current picture as non-reference after allocating it, so
2208     // that if we break out due to an error it can be released automatically
2209     // in the next MPV_frame_start().
2210     // SVQ3 as well as most other codecs have only last/next/current and thus
2211     // get released even with set reference, besides SVQ3 and others do not
2212     // mark frames as reference later "naturally".
2213     if(s->codec_id != CODEC_ID_SVQ3)
2214         s->current_picture_ptr->reference= 0;
2215
2216     s->current_picture_ptr->field_poc[0]=
2217     s->current_picture_ptr->field_poc[1]= INT_MAX;
2218     assert(s->current_picture_ptr->long_ref==0);
2219
2220     return 0;
2221 }
2222
/**
 * Saves the border samples of the current macroblock: its rightmost column
 * is stored in h->left_border and its bottom row in
 * h->top_borders[top_idx][s->mb_x] (16 luma bytes followed by 8 Cb and
 * 8 Cr bytes). hl_decode_mb_internal() calls this before it runs the loop
 * filter on the macroblock.
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left Cb sample of the macroblock
 * @param src_cr     top-left Cr sample of the macroblock
 * @param linesize   luma line size used for this macroblock
 * @param uvlinesize chroma line size used for this macroblock
 * @param simple     if nonzero, the MBAFF handling is skipped and chroma is
 *                   always saved
 */
2223 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2224     MpegEncContext * const s = &h->s;
2225     int i;
2226     int step    = 1;
2227     int offset  = 1;
2228     int uvoffset= 1;
2229     int top_idx = 1;
2230     int skiplast= 0;
2231
         // step back one row, so that row index i below addresses macroblock row i-1
2232     src_y  -=   linesize;
2233     src_cb -= uvlinesize;
2234     src_cr -= uvlinesize;
2235
2236     if(!simple && FRAME_MBAFF){
2237         if(s->mb_y&1){
2238             offset  = MB_MBAFF ? 1 : 17;
2239             uvoffset= MB_MBAFF ? 1 : 9;
2240             if(!MB_MBAFF){
2241                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2242                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2243                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2244                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2245                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2246                 }
2247             }
2248         }else{
2249             if(!MB_MBAFF){
2250                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2251                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2252                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2253                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2254                 }
2255                 skiplast= 1;
2256             }
2257             offset  =
2258             uvoffset=
2259             top_idx = MB_MBAFF ? 0 : 1;
2260         }
2261         step= MB_MBAFF ? 2 : 1;
2262     }
2263
2264     // There are two lines saved, the line above the top macroblock of a pair,
2265     // and the line above the bottom macroblock
         // the last sample of the old top border is copied into left_border
         // before the top border gets overwritten below
2266     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2267     for(i=1; i<17 - skiplast; i++){
2268         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2269     }
2270
2271     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2272     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2273
         // chroma: Cb is kept at left_border[34..], Cr at left_border[34+18..]
2274     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2275         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2276         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2277         for(i=1; i<9 - skiplast; i++){
2278             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2279             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2280         }
2281         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2282         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2283     }
2284 }
2285
/**
 * Exchanges the picture samples directly left of and above the current
 * macroblock with the values stored in h->left_border / h->top_borders.
 * The picture always receives the stored values; the stored values are
 * replaced by the previous picture content only where the exchange flag of
 * the individual XCHG() use is nonzero.
 * hl_decode_mb_internal() calls this with xchg=1 before intra prediction and
 * with xchg=0 afterwards, and only if h->deblocking_filter is set.
 * @param src_y      top-left luma sample of the macroblock
 * @param src_cb     top-left Cb sample of the macroblock
 * @param src_cr     top-left Cr sample of the macroblock
 * @param linesize   luma line size used for this macroblock
 * @param uvlinesize chroma line size used for this macroblock
 * @param xchg       exchange flag passed to most XCHG() uses; some uses
 *                   always pass 1
 * @param simple     if nonzero, the MBAFF handling is skipped and chroma is
 *                   always processed
 */
2286 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2287     MpegEncContext * const s = &h->s;
2288     int temp8, i;
2289     uint64_t temp64;
2290     int deblock_left;
2291     int deblock_top;
2292     int mb_xy;
2293     int step    = 1;
2294     int offset  = 1;
2295     int uvoffset= 1;
2296     int top_idx = 1;
2297
2298     if(!simple && FRAME_MBAFF){
2299         if(s->mb_y&1){
2300             offset  = MB_MBAFF ? 1 : 17;
2301             uvoffset= MB_MBAFF ? 1 : 9;
2302         }else{
2303             offset  =
2304             uvoffset=
2305             top_idx = MB_MBAFF ? 0 : 1;
2306         }
2307         step= MB_MBAFF ? 2 : 1;
2308     }
2309
         // with deblocking_filter == 2 a border is only touched if the
         // neighbouring macroblock has the same slice_table entry
2310     if(h->deblocking_filter == 2) {
2311         mb_xy = h->mb_xy;
2312         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2313         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2314     } else {
2315         deblock_left = (s->mb_x > 0);
2316         deblock_top =  (s->mb_y > !!MB_FIELD);
2317     }
2318
         // move to the sample above and left of the macroblock
2319     src_y  -=   linesize + 1;
2320     src_cb -= uvlinesize + 1;
2321     src_cr -= uvlinesize + 1;
2322
     // b always receives the old value of a; a receives the old value of b
     // only if xchg is nonzero. The macro stays defined after this function.
2323 #define XCHG(a,b,t,xchg)\
2324 t= a;\
2325 if(xchg)\
2326     a= b;\
2327 b= t;
2328
2329     if(deblock_left){
2330         for(i = !deblock_top; i<16; i++){
2331             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2332         }
             // i == 16 here: the last row is exchanged unconditionally
2333         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2334     }
2335
2336     if(deblock_top){
2337         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2338         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2339         if(s->mb_x+1 < s->mb_width){
2340             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2341         }
2342     }
2343
2344     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2345         if(deblock_left){
2346             for(i = !deblock_top; i<8; i++){
2347                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2348                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2349             }
2350             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2351             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2352         }
2353         if(deblock_top){
2354             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2355             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2356         }
2357     }
2358 }
2359
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture: performs intra prediction or motion compensation,
 * adds the residual stored in h->mb and, if h->deblocking_filter is set,
 * saves the macroblock borders and runs the loop filter.
 * @param simple The wrappers hl_decode_mb_simple() / hl_decode_mb_complex()
 *               pass the constants 1 / 0. When nonzero, the checks for field
 *               macroblocks, intra PCM, transform bypass, gray-only decoding,
 *               encoding and non-H.264 codec ids are skipped.
 */
2360 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2361     MpegEncContext * const s = &h->s;
2362     const int mb_x= s->mb_x;
2363     const int mb_y= s->mb_y;
2364     const int mb_xy= h->mb_xy;
2365     const int mb_type= s->current_picture.mb_type[mb_xy];
2366     uint8_t  *dest_y, *dest_cb, *dest_cr;
2367     int linesize, uvlinesize /*dct_offset*/;
2368     int i;
2369     int *block_offset = &h->block_offset[0];
2370     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2371     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2372     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2373     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2374
2375     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2376     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2377     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2378
2379     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2380     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2381
         // field macroblock: doubled line sizes and the second half of block_offset
2382     if (!simple && MB_FIELD) {
2383         linesize   = h->mb_linesize   = s->linesize * 2;
2384         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2385         block_offset = &h->block_offset[24];
2386         if(mb_y&1){ //FIXME move out of this function?
2387             dest_y -= s->linesize*15;
2388             dest_cb-= s->uvlinesize*7;
2389             dest_cr-= s->uvlinesize*7;
2390         }
2391         if(FRAME_MBAFF) {
2392             int list;
2393             for(list=0; list<h->list_count; list++){
2394                 if(!USES_LIST(mb_type, list))
2395                     continue;
2396                 if(IS_16X16(mb_type)){
2397                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2398                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2399                 }else{
2400                     for(i=0; i<16; i+=4){
2401                         int ref = h->ref_cache[list][scan8[i]];
2402                         if(ref >= 0)
2403                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2404                     }
2405                 }
2406             }
2407         }
2408     } else {
2409         linesize   = h->mb_linesize   = s->linesize;
2410         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2411 //        dct_offset = s->linesize * 16;
2412     }
2413
         // intra PCM: the samples are copied directly from h->mb
2414     if (!simple && IS_INTRA_PCM(mb_type)) {
2415         for (i=0; i<16; i++) {
2416             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2417         }
2418         for (i=0; i<8; i++) {
2419             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2420             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2421         }
2422     } else {
2423         if(IS_INTRA(mb_type)){
                 // swap in the saved border samples for the duration of intra prediction
2424             if(h->deblocking_filter)
2425                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2426
2427             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2428                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2429                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2430             }
2431
2432             if(IS_INTRA4x4(mb_type)){
2433                 if(simple || !s->encoding){
2434                     if(IS_8x8DCT(mb_type)){
2435                         if(transform_bypass){
2436                             idct_dc_add =
2437                             idct_add    = s->dsp.add_pixels8;
2438                         }else{
2439                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2440                             idct_add    = s->dsp.h264_idct8_add;
2441                         }
2442                         for(i=0; i<16; i+=4){
2443                             uint8_t * const ptr= dest_y + block_offset[i];
2444                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2445                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2446                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2447                             }else{
2448                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2449                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2450                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2451                                 if(nnz){
2452                                     if(nnz == 1 && h->mb[i*16])
2453                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2454                                     else
2455                                         idct_add   (ptr, h->mb + i*16, linesize);
2456                                 }
2457                             }
2458                         }
2459                     }else{
2460                         if(transform_bypass){
2461                             idct_dc_add =
2462                             idct_add    = s->dsp.add_pixels4;
2463                         }else{
2464                             idct_dc_add = s->dsp.h264_idct_dc_add;
2465                             idct_add    = s->dsp.h264_idct_add;
2466                         }
2467                         for(i=0; i<16; i++){
2468                             uint8_t * const ptr= dest_y + block_offset[i];
2469                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2470
2471                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2472                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2473                             }else{
2474                                 uint8_t *topright;
2475                                 int nnz, tr;
                                     // only these two modes get a top-right pointer; if the
                                     // top-right samples are unavailable, the last sample of
                                     // the row above is replicated four times into tr
2476                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2477                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2478                                     assert(mb_y || linesize <= block_offset[i]);
2479                                     if(!topright_avail){
2480                                         tr= ptr[3 - linesize]*0x01010101;
2481                                         topright= (uint8_t*) &tr;
2482                                     }else
2483                                         topright= ptr + 4 - linesize;
2484                                 }else
2485                                     topright= NULL;
2486
2487                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2488                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2489                                 if(nnz){
2490                                     if(is_h264){
2491                                         if(nnz == 1 && h->mb[i*16])
2492                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2493                                         else
2494                                             idct_add   (ptr, h->mb + i*16, linesize);
2495                                     }else
2496                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2497                                 }
2498                             }
2499                         }
2500                     }
2501                 }
2502             }else{
2503                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2504                 if(is_h264){
2505                     if(!transform_bypass)
2506                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2507                 }else
2508                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2509             }
                 // prediction is done: put the border samples back (xchg=0)
2510             if(h->deblocking_filter)
2511                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2512         }else if(is_h264){
2513             hl_motion(h, dest_y, dest_cb, dest_cr,
2514                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2515                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2516                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2517         }
2518
2519
             // luma residual; intra 4x4 macroblocks already had it added above
2520         if(!IS_INTRA4x4(mb_type)){
2521             if(is_h264){
2522                 if(IS_INTRA16x16(mb_type)){
2523                     if(transform_bypass){
2524                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2525                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2526                         }else{
2527                             for(i=0; i<16; i++){
2528                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2529                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2530                             }
2531                         }
2532                     }else{
2533                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2534                     }
2535                 }else if(h->cbp&15){
2536                     if(transform_bypass){
2537                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2538                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2539                         for(i=0; i<16; i+=di){
2540                             if(h->non_zero_count_cache[ scan8[i] ]){
2541                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2542                             }
2543                         }
2544                     }else{
2545                         if(IS_8x8DCT(mb_type)){
2546                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2547                         }else{
2548                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2549                         }
2550                     }
2551                 }
2552             }else{
2553                 for(i=0; i<16; i++){
2554                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2555                         uint8_t * const ptr= dest_y + block_offset[i];
2556                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2557                     }
2558                 }
2559             }
2560         }
2561
             // chroma residual: blocks 16..19 go to dest_cb, blocks 20..23 to dest_cr
2562         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2563             uint8_t *dest[2] = {dest_cb, dest_cr};
2564             if(transform_bypass){
2565                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2566                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2567                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2568                 }else{
2569                     idct_add = s->dsp.add_pixels4;
2570                     for(i=16; i<16+8; i++){
2571                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2572                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2573                     }
2574                 }
2575             }else{
2576                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2577                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2578                 if(is_h264){
2579                     idct_add = s->dsp.h264_idct_add;
2580                     idct_dc_add = s->dsp.h264_idct_dc_add;
2581                     for(i=16; i<16+8; i++){
2582                         if(h->non_zero_count_cache[ scan8[i] ])
2583                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2584                         else if(h->mb[i*16])
2585                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2586                     }
2587                 }else{
2588                     for(i=16; i<16+8; i++){
2589                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2590                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2591                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2592                         }
2593                     }
2594                 }
2595             }
2596         }
2597     }
         // save the borders first, then run the loop filter on this macroblock
2598     if(h->deblocking_filter) {
2599         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2600         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2601         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2602         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2603         if (!simple && FRAME_MBAFF) {
2604             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2605         } else {
2606             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2607         }
2608     }
2609 }
2610
2611 /**
2612  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2613  */
2614 static void hl_decode_mb_simple(H264Context *h){
2615     hl_decode_mb_internal(h, 1);
2616 }
2617
2618 /**
2619  * Process a macroblock; this handles edge cases, such as interlacing.
2620  */
2621 static void av_noinline hl_decode_mb_complex(H264Context *h){
2622     hl_decode_mb_internal(h, 0);
2623 }
2624
2625 static void hl_decode_mb(H264Context *h){
2626     MpegEncContext * const s = &h->s;
2627     const int mb_xy= h->mb_xy;
2628     const int mb_type= s->current_picture.mb_type[mb_xy];
2629     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2630
2631     if(ENABLE_H264_ENCODER && !s->decode)
2632         return;
2633
2634     if (is_complex)
2635         hl_decode_mb_complex(h);
2636     else hl_decode_mb_simple(h);
2637 }
2638
2639 static void pic_as_field(Picture *pic, const int parity){
2640     int i;
2641     for (i = 0; i < 4; ++i) {
2642         if (parity == PICT_BOTTOM_FIELD)
2643             pic->data[i] += pic->linesize[i];
2644         pic->reference = parity;
2645         pic->linesize[i] *= 2;
2646     }
2647     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2648 }
2649
2650 static int split_field_copy(Picture *dest, Picture *src,
2651                             int parity, int id_add){
2652     int match = !!(src->reference & parity);
2653
2654     if (match) {
2655         *dest = *src;
2656         if(parity != PICT_FRAME){
2657             pic_as_field(dest, parity);
2658             dest->pic_id *= 2;
2659             dest->pic_id += id_add;
2660         }
2661     }
2662
2663     return match;
2664 }
2665
2666 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2667     int i[2]={0};
2668     int index=0;
2669
2670     while(i[0]<len || i[1]<len){
2671         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2672             i[0]++;
2673         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2674             i[1]++;
2675         if(i[0] < len){
2676             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2677             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2678         }
2679         if(i[1] < len){
2680             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2681             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2682         }
2683     }
2684
2685     return index;
2686 }
2687
2688 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2689     int i, best_poc;
2690     int out_i= 0;
2691
2692     for(;;){
2693         best_poc= dir ? INT_MIN : INT_MAX;
2694
2695         for(i=0; i<len; i++){
2696             const int poc= src[i]->poc;
2697             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2698                 best_poc= poc;
2699                 sorted[out_i]= src[i];
2700             }
2701         }
2702         if(best_poc == (dir ? INT_MIN : INT_MAX))
2703             break;
2704         limit= sorted[out_i++]->poc - dir;
2705     }
2706     return out_i;
2707 }
2708
2709 /**
2710  * fills the default_ref_list.
2711  */
2712 static int fill_default_ref_list(H264Context *h){
2713     MpegEncContext * const s = &h->s;
2714     int i, len;
2715
2716     if(h->slice_type_nos==FF_B_TYPE){
2717         Picture *sorted[32];
2718         int cur_poc, list;
2719         int lens[2];
2720
2721         if(FIELD_PICTURE)
2722             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2723         else
2724             cur_poc= s->current_picture_ptr->poc;
2725
2726         for(list= 0; list<2; list++){
2727             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2728             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2729             assert(len<=32);
2730             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2731             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2732             assert(len<=32);
2733
2734             if(len < h->ref_count[list])
2735                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2736             lens[list]= len;
2737         }
2738
2739         if(lens[0] == lens[1] && lens[1] > 1){
2740             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2741             if(i == lens[0])
2742                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2743         }
2744     }else{
2745         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2746         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2747         assert(len <= 32);
2748         if(len < h->ref_count[0])
2749             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2750     }
2751 #ifdef TRACE
2752     for (i=0; i<h->ref_count[0]; i++) {
2753         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2754     }
2755     if(h->slice_type_nos==FF_B_TYPE){
2756         for (i=0; i<h->ref_count[1]; i++) {
2757             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2758         }
2759     }
2760 #endif
2761     return 0;
2762 }
2763
2764 static void print_short_term(H264Context *h);
2765 static void print_long_term(H264Context *h);
2766
2767 /**
2768  * Extract structure information about the picture described by pic_num in
2769  * the current decoding context (frame or field). Note that pic_num is
2770  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2771  * @param pic_num picture number for which to extract structure information
2772  * @param structure one of PICT_XXX describing structure of picture
2773  *                      with pic_num
2774  * @return frame number (short term) or long term index of picture
2775  *         described by pic_num
2776  */
2777 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2778     MpegEncContext * const s = &h->s;
2779
2780     *structure = s->picture_structure;
2781     if(FIELD_PICTURE){
2782         if (!(pic_num & 1))
2783             /* opposite field */
2784             *structure ^= PICT_FRAME;
2785         pic_num >>= 1;
2786     }
2787
2788     return pic_num;
2789 }
2790
2791 static int decode_ref_pic_list_reordering(H264Context *h){
2792     MpegEncContext * const s = &h->s;
2793     int list, index, pic_structure;
2794
2795     print_short_term(h);
2796     print_long_term(h);
2797
2798     for(list=0; list<h->list_count; list++){
2799         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2800
2801         if(get_bits1(&s->gb)){
2802             int pred= h->curr_pic_num;
2803
2804             for(index=0; ; index++){
2805                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2806                 unsigned int pic_id;
2807                 int i;
2808                 Picture *ref = NULL;
2809
2810                 if(reordering_of_pic_nums_idc==3)
2811                     break;
2812
2813                 if(index >= h->ref_count[list]){
2814                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2815                     return -1;
2816                 }
2817
2818                 if(reordering_of_pic_nums_idc<3){
2819                     if(reordering_of_pic_nums_idc<2){
2820                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2821                         int frame_num;
2822
2823                         if(abs_diff_pic_num > h->max_pic_num){
2824                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2825                             return -1;
2826                         }
2827
2828                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2829                         else                                pred+= abs_diff_pic_num;
2830                         pred &= h->max_pic_num - 1;
2831
2832                         frame_num = pic_num_extract(h, pred, &pic_structure);
2833
2834                         for(i= h->short_ref_count-1; i>=0; i--){
2835                             ref = h->short_ref[i];
2836                             assert(ref->reference);
2837                             assert(!ref->long_ref);
2838                             if(
2839                                    ref->frame_num == frame_num &&
2840                                    (ref->reference & pic_structure)
2841                               )
2842                                 break;
2843                         }
2844                         if(i>=0)
2845                             ref->pic_id= pred;
2846                     }else{
2847                         int long_idx;
2848                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2849
2850                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2851
2852                         if(long_idx>31){
2853                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2854                             return -1;
2855                         }
2856                         ref = h->long_ref[long_idx];
2857                         assert(!(ref && !ref->reference));
2858                         if(ref && (ref->reference & pic_structure)){
2859                             ref->pic_id= pic_id;
2860                             assert(ref->long_ref);
2861                             i=0;
2862                         }else{
2863                             i=-1;
2864                         }
2865                     }
2866
2867                     if (i < 0) {
2868                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2869                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2870                     } else {
2871                         for(i=index; i+1<h->ref_count[list]; i++){
2872                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2873                                 break;
2874                         }
2875                         for(; i > index; i--){
2876                             h->ref_list[list][i]= h->ref_list[list][i-1];
2877                         }
2878                         h->ref_list[list][index]= *ref;
2879                         if (FIELD_PICTURE){
2880                             pic_as_field(&h->ref_list[list][index], pic_structure);
2881                         }
2882                     }
2883                 }else{
2884                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2885                     return -1;
2886                 }
2887             }
2888         }
2889     }
2890     for(list=0; list<h->list_count; list++){
2891         for(index= 0; index < h->ref_count[list]; index++){
2892             if(!h->ref_list[list][index].data[0]){
2893                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2894                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2895             }
2896         }
2897     }
2898
2899     return 0;
2900 }
2901
2902 static void fill_mbaff_ref_list(H264Context *h){
2903     int list, i, j;
2904     for(list=0; list<2; list++){ //FIXME try list_count
2905         for(i=0; i<h->ref_count[list]; i++){
2906             Picture *frame = &h->ref_list[list][i];
2907             Picture *field = &h->ref_list[list][16+2*i];
2908             field[0] = *frame;
2909             for(j=0; j<3; j++)
2910                 field[0].linesize[j] <<= 1;
2911             field[0].reference = PICT_TOP_FIELD;
2912             field[0].poc= field[0].field_poc[0];
2913             field[1] = field[0];
2914             for(j=0; j<3; j++)
2915                 field[1].data[j] += frame->linesize[j];
2916             field[1].reference = PICT_BOTTOM_FIELD;
2917             field[1].poc= field[1].field_poc[1];
2918
2919             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2920             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2921             for(j=0; j<2; j++){
2922                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2923                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2924             }
2925         }
2926     }
2927     for(j=0; j<h->ref_count[1]; j++){
2928         for(i=0; i<h->ref_count[0]; i++)
2929             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2930         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2931         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2932     }
2933 }
2934
2935 static int pred_weight_table(H264Context *h){
2936     MpegEncContext * const s = &h->s;
2937     int list, i;
2938     int luma_def, chroma_def;
2939
2940     h->use_weight= 0;
2941     h->use_weight_chroma= 0;
2942     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2943     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2944     luma_def = 1<<h->luma_log2_weight_denom;
2945     chroma_def = 1<<h->chroma_log2_weight_denom;
2946
2947     for(list=0; list<2; list++){
2948         for(i=0; i<h->ref_count[list]; i++){
2949             int luma_weight_flag, chroma_weight_flag;
2950
2951             luma_weight_flag= get_bits1(&s->gb);
2952             if(luma_weight_flag){
2953                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2954                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2955                 if(   h->luma_weight[list][i] != luma_def
2956                    || h->luma_offset[list][i] != 0)
2957                     h->use_weight= 1;
2958             }else{
2959                 h->luma_weight[list][i]= luma_def;
2960                 h->luma_offset[list][i]= 0;
2961             }
2962
2963             if(CHROMA){
2964                 chroma_weight_flag= get_bits1(&s->gb);
2965                 if(chroma_weight_flag){
2966                     int j;
2967                     for(j=0; j<2; j++){
2968                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2969                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2970                         if(   h->chroma_weight[list][i][j] != chroma_def
2971                         || h->chroma_offset[list][i][j] != 0)
2972                             h->use_weight_chroma= 1;
2973                     }
2974                 }else{
2975                     int j;
2976                     for(j=0; j<2; j++){
2977                         h->chroma_weight[list][i][j]= chroma_def;
2978                         h->chroma_offset[list][i][j]= 0;
2979                     }
2980                 }
2981             }
2982         }
2983         if(h->slice_type_nos != FF_B_TYPE) break;
2984     }
2985     h->use_weight= h->use_weight || h->use_weight_chroma;
2986     return 0;
2987 }
2988
2989 static void implicit_weight_table(H264Context *h){
2990     MpegEncContext * const s = &h->s;
2991     int ref0, ref1;
2992     int cur_poc = s->current_picture_ptr->poc;
2993
2994     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2995        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2996         h->use_weight= 0;
2997         h->use_weight_chroma= 0;
2998         return;
2999     }
3000
3001     h->use_weight= 2;
3002     h->use_weight_chroma= 2;
3003     h->luma_log2_weight_denom= 5;
3004     h->chroma_log2_weight_denom= 5;
3005
3006     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3007         int poc0 = h->ref_list[0][ref0].poc;
3008         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3009             int poc1 = h->ref_list[1][ref1].poc;
3010             int td = av_clip(poc1 - poc0, -128, 127);
3011             if(td){
3012                 int tb = av_clip(cur_poc - poc0, -128, 127);
3013                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3014                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3015                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3016                     h->implicit_weight[ref0][ref1] = 32;
3017                 else
3018                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3019             }else
3020                 h->implicit_weight[ref0][ref1] = 32;
3021         }
3022     }
3023 }
3024
3025 /**
3026  * Mark a picture as no longer needed for reference. The refmask
3027  * argument allows unreferencing of individual fields or the whole frame.
3028  * If the picture becomes entirely unreferenced, but is being held for
3029  * display purposes, it is marked as such.
3030  * @param refmask mask of fields to unreference; the mask is bitwise
3031  *                anded with the reference marking of pic
3032  * @return non-zero if pic becomes entirely unreferenced (except possibly
3033  *         for display purposes) zero if one of the fields remains in
3034  *         reference
3035  */
3036 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3037     int i;
3038     if (pic->reference &= refmask) {
3039         return 0;
3040     } else {
3041         for(i = 0; h->delayed_pic[i]; i++)
3042             if(pic == h->delayed_pic[i]){
3043                 pic->reference=DELAYED_PIC_REF;
3044                 break;
3045             }
3046         return 1;
3047     }
3048 }
3049
3050 /**
3051  * instantaneous decoder refresh.
3052  */
3053 static void idr(H264Context *h){
3054     int i;
3055
3056     for(i=0; i<16; i++){
3057         remove_long(h, i, 0);
3058     }
3059     assert(h->long_ref_count==0);
3060
3061     for(i=0; i<h->short_ref_count; i++){
3062         unreference_pic(h, h->short_ref[i], 0);
3063         h->short_ref[i]= NULL;
3064     }
3065     h->short_ref_count=0;
3066     h->prev_frame_num= 0;
3067     h->prev_frame_num_offset= 0;
3068     h->prev_poc_msb=
3069     h->prev_poc_lsb= 0;
3070 }
3071
3072 /* forget old pics after a seek */
3073 static void flush_dpb(AVCodecContext *avctx){
3074     H264Context *h= avctx->priv_data;
3075     int i;
3076     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3077         if(h->delayed_pic[i])
3078             h->delayed_pic[i]->reference= 0;
3079         h->delayed_pic[i]= NULL;
3080     }
3081     h->outputed_poc= INT_MIN;
3082     idr(h);
3083     if(h->s.current_picture_ptr)
3084         h->s.current_picture_ptr->reference= 0;
3085     h->s.first_field= 0;
3086     ff_mpeg_flush(avctx);
3087 }
3088
3089 /**
3090  * Find a Picture in the short term reference list by frame number.
3091  * @param frame_num frame number to search for
3092  * @param idx the index into h->short_ref where returned picture is found
3093  *            undefined if no picture found.
3094  * @return pointer to the found picture, or NULL if no pic with the provided
3095  *                 frame number is found
3096  */
3097 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3098     MpegEncContext * const s = &h->s;
3099     int i;
3100
3101     for(i=0; i<h->short_ref_count; i++){
3102         Picture *pic= h->short_ref[i];
3103         if(s->avctx->debug&FF_DEBUG_MMCO)
3104             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3105         if(pic->frame_num == frame_num) {
3106             *idx = i;
3107             return pic;
3108         }
3109     }
3110     return NULL;
3111 }
3112
3113 /**
3114  * Remove a picture from the short term reference list by its index in
3115  * that list.  This does no checking on the provided index; it is assumed
3116  * to be valid. Other list entries are shifted down.
3117  * @param i index into h->short_ref of picture to remove.
3118  */
3119 static void remove_short_at_index(H264Context *h, int i){
3120     assert(i >= 0 && i < h->short_ref_count);
3121     h->short_ref[i]= NULL;
3122     if (--h->short_ref_count)
3123         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3124 }
3125
3126 /**
3127  *
3128  * @return the removed picture or NULL if an error occurs
3129  */
3130 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3131     MpegEncContext * const s = &h->s;
3132     Picture *pic;
3133     int i;
3134
3135     if(s->avctx->debug&FF_DEBUG_MMCO)
3136         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3137
3138     pic = find_short(h, frame_num, &i);
3139     if (pic){
3140         if(unreference_pic(h, pic, ref_mask))
3141         remove_short_at_index(h, i);
3142     }
3143
3144     return pic;
3145 }
3146
3147 /**
3148  * Remove a picture from the long term reference list by its index in
3149  * that list.
3150  * @return the removed picture or NULL if an error occurs
3151  */
3152 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3153     Picture *pic;
3154
3155     pic= h->long_ref[i];
3156     if (pic){
3157         if(unreference_pic(h, pic, ref_mask)){
3158             assert(h->long_ref[i]->long_ref == 1);
3159             h->long_ref[i]->long_ref= 0;
3160             h->long_ref[i]= NULL;
3161             h->long_ref_count--;
3162         }
3163     }
3164
3165     return pic;
3166 }
3167
3168 /**
3169  * print short term list
3170  */
3171 static void print_short_term(H264Context *h) {
3172     uint32_t i;
3173     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3174         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3175         for(i=0; i<h->short_ref_count; i++){
3176             Picture *pic= h->short_ref[i];
3177             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3178         }
3179     }
3180 }
3181
3182 /**
3183  * print long term list
3184  */
3185 static void print_long_term(H264Context *h) {
3186     uint32_t i;
3187     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3188         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3189         for(i = 0; i < 16; i++){
3190             Picture *pic= h->long_ref[i];
3191             if (pic) {
3192                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3193             }
3194         }
3195     }
3196 }
3197
3198 /**
3199  * Executes the reference picture marking (memory management control operations).
3200  */
3201 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3202     MpegEncContext * const s = &h->s;
3203     int i, j;
3204     int current_ref_assigned=0;
3205     Picture *pic;
3206
3207     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3208         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3209
3210     for(i=0; i<mmco_count; i++){
3211         int structure, frame_num;
3212         if(s->avctx->debug&FF_DEBUG_MMCO)
3213             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3214
3215         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3216            || mmco[i].opcode == MMCO_SHORT2LONG){
3217             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3218             pic = find_short(h, frame_num, &j);
3219             if(!pic){
3220                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3221                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3222                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3223                 continue;
3224             }
3225         }
3226
3227         switch(mmco[i].opcode){
3228         case MMCO_SHORT2UNUSED:
3229             if(s->avctx->debug&FF_DEBUG_MMCO)
3230                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3231             remove_short(h, frame_num, structure ^ PICT_FRAME);
3232             break;
3233         case MMCO_SHORT2LONG:
3234                 if (h->long_ref[mmco[i].long_arg] != pic)
3235                     remove_long(h, mmco[i].long_arg, 0);
3236
3237                 remove_short_at_index(h, j);
3238                 h->long_ref[ mmco[i].long_arg ]= pic;
3239                 if (h->long_ref[ mmco[i].long_arg ]){
3240                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3241                     h->long_ref_count++;
3242                 }
3243             break;
3244         case MMCO_LONG2UNUSED:
3245             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3246             pic = h->long_ref[j];
3247             if (pic) {
3248                 remove_long(h, j, structure ^ PICT_FRAME);
3249             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3250                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3251             break;
3252         case MMCO_LONG:
3253                     // Comment below left from previous code as it is an interresting note.
3254                     /* First field in pair is in short term list or
3255                      * at a different long term index.
3256                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3257                      * Report the problem and keep the pair where it is,
3258                      * and mark this field valid.
3259                      */
3260
3261             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3262                 remove_long(h, mmco[i].long_arg, 0);
3263
3264                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3265                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3266                 h->long_ref_count++;
3267             }
3268
3269             s->current_picture_ptr->reference |= s->picture_structure;
3270             current_ref_assigned=1;
3271             break;
3272         case MMCO_SET_MAX_LONG:
3273             assert(mmco[i].long_arg <= 16);
3274             // just remove the long term which index is greater than new max
3275             for(j = mmco[i].long_arg; j<16; j++){
3276                 remove_long(h, j, 0);
3277             }
3278             break;
3279         case MMCO_RESET:
3280             while(h->short_ref_count){
3281                 remove_short(h, h->short_ref[0]->frame_num, 0);
3282             }
3283             for(j = 0; j < 16; j++) {
3284                 remove_long(h, j, 0);
3285             }
3286             s->current_picture_ptr->poc=
3287             s->current_picture_ptr->field_poc[0]=
3288             s->current_picture_ptr->field_poc[1]=
3289             h->poc_lsb=
3290             h->poc_msb=
3291             h->frame_num=
3292             s->current_picture_ptr->frame_num= 0;
3293             break;
3294         default: assert(0);
3295         }
3296     }
3297
3298     if (!current_ref_assigned) {
3299         /* Second field of complementary field pair; the first field of
3300          * which is already referenced. If short referenced, it
3301          * should be first entry in short_ref. If not, it must exist
3302          * in long_ref; trying to put it on the short list here is an
3303          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3304          */
3305         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3306             /* Just mark the second field valid */
3307             s->current_picture_ptr->reference = PICT_FRAME;
3308         } else if (s->current_picture_ptr->long_ref) {
3309             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3310                                              "assignment for second field "
3311                                              "in complementary field pair "
3312                                              "(first field is long term)\n");
3313         } else {
3314             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3315             if(pic){
3316                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3317             }
3318
3319             if(h->short_ref_count)
3320                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3321
3322             h->short_ref[0]= s->current_picture_ptr;
3323             h->short_ref_count++;
3324             s->current_picture_ptr->reference |= s->picture_structure;
3325         }
3326     }
3327
3328     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3329
3330         /* We have too many reference frames, probably due to corrupted
3331          * stream. Need to discard one frame. Prevents overrun of the
3332          * short_ref and long_ref buffers.
3333          */
3334         av_log(h->s.avctx, AV_LOG_ERROR,
3335                "number of reference frames exceeds max (probably "
3336                "corrupt input), discarding one\n");
3337
3338         if (h->long_ref_count && !h->short_ref_count) {
3339             for (i = 0; i < 16; ++i)
3340                 if (h->long_ref[i])
3341                     break;
3342
3343             assert(i < 16);
3344             remove_long(h, i, 0);
3345         } else {
3346             pic = h->short_ref[h->short_ref_count - 1];
3347             remove_short(h, pic->frame_num, 0);
3348         }
3349     }
3350
3351     print_short_term(h);
3352     print_long_term(h);
3353     return 0;
3354 }
3355
3356 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3357     MpegEncContext * const s = &h->s;
3358     int i;
3359
3360     h->mmco_index= 0;
3361     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3362         s->broken_link= get_bits1(gb) -1;
3363         if(get_bits1(gb)){
3364             h->mmco[0].opcode= MMCO_LONG;
3365             h->mmco[0].long_arg= 0;
3366             h->mmco_index= 1;
3367         }
3368     }else{
3369         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3370             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3371                 MMCOOpcode opcode= get_ue_golomb(gb);
3372
3373                 h->mmco[i].opcode= opcode;
3374                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3375                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3376 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3377                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3378                         return -1;
3379                     }*/
3380                 }
3381                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3382                     unsigned int long_arg= get_ue_golomb(gb);
3383                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3384                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3385                         return -1;
3386                     }
3387                     h->mmco[i].long_arg= long_arg;
3388                 }
3389
3390                 if(opcode > (unsigned)MMCO_LONG){
3391                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3392                     return -1;
3393                 }
3394                 if(opcode == MMCO_END)
3395                     break;
3396             }
3397             h->mmco_index= i;
3398         }else{
3399             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3400
3401             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3402                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3403                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3404                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3405                 h->mmco_index= 1;
3406                 if (FIELD_PICTURE) {
3407                     h->mmco[0].short_pic_num *= 2;
3408                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3409                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3410                     h->mmco_index= 2;
3411                 }
3412             }
3413         }
3414     }
3415
3416     return 0;
3417 }
3418
3419 static int init_poc(H264Context *h){
3420     MpegEncContext * const s = &h->s;
3421     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3422     int field_poc[2];
3423     Picture *cur = s->current_picture_ptr;
3424
3425     h->frame_num_offset= h->prev_frame_num_offset;
3426     if(h->frame_num < h->prev_frame_num)
3427         h->frame_num_offset += max_frame_num;
3428
3429     if(h->sps.poc_type==0){
3430         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3431
3432         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3433             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3434         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3435             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3436         else
3437             h->poc_msb = h->prev_poc_msb;
3438 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3439         field_poc[0] =
3440         field_poc[1] = h->poc_msb + h->poc_lsb;
3441         if(s->picture_structure == PICT_FRAME)
3442             field_poc[1] += h->delta_poc_bottom;
3443     }else if(h->sps.poc_type==1){
3444         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3445         int i;
3446
3447         if(h->sps.poc_cycle_length != 0)
3448             abs_frame_num = h->frame_num_offset + h->frame_num;
3449         else
3450             abs_frame_num = 0;
3451
3452         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3453             abs_frame_num--;
3454
3455         expected_delta_per_poc_cycle = 0;
3456         for(i=0; i < h->sps.poc_cycle_length; i++)
3457             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3458
3459         if(abs_frame_num > 0){
3460             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3461             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3462
3463             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3464             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3465                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3466         } else
3467             expectedpoc = 0;
3468
3469         if(h->nal_ref_idc == 0)
3470             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3471
3472         field_poc[0] = expectedpoc + h->delta_poc[0];
3473         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3474
3475         if(s->picture_structure == PICT_FRAME)
3476             field_poc[1] += h->delta_poc[1];
3477     }else{
3478         int poc= 2*(h->frame_num_offset + h->frame_num);
3479
3480         if(!h->nal_ref_idc)
3481             poc--;
3482
3483         field_poc[0]= poc;
3484         field_poc[1]= poc;
3485     }
3486
3487     if(s->picture_structure != PICT_BOTTOM_FIELD)
3488         s->current_picture_ptr->field_poc[0]= field_poc[0];
3489     if(s->picture_structure != PICT_TOP_FIELD)
3490         s->current_picture_ptr->field_poc[1]= field_poc[1];
3491     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3492
3493     return 0;
3494 }
3495
3496
3497 /**
3498  * initialize scan tables
3499  */
3500 static void init_scan_tables(H264Context *h){
3501     MpegEncContext * const s = &h->s;
3502     int i;
3503     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3504         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3505         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3506     }else{
3507         for(i=0; i<16; i++){
3508 #define T(x) (x>>2) | ((x<<2) & 0xF)
3509             h->zigzag_scan[i] = T(zigzag_scan[i]);
3510             h-> field_scan[i] = T( field_scan[i]);
3511 #undef T
3512         }
3513     }
3514     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3515         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3516         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3517         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3518         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3519     }else{
3520         for(i=0; i<64; i++){
3521 #define T(x) (x>>3) | ((x&7)<<3)
3522             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3523             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3524             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3525             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3526 #undef T
3527         }
3528     }
3529     if(h->sps.transform_bypass){ //FIXME same ugly
3530         h->zigzag_scan_q0          = zigzag_scan;
3531         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3532         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3533         h->field_scan_q0           = field_scan;
3534         h->field_scan8x8_q0        = field_scan8x8;
3535         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3536     }else{
3537         h->zigzag_scan_q0          = h->zigzag_scan;
3538         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3539         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3540         h->field_scan_q0           = h->field_scan;
3541         h->field_scan8x8_q0        = h->field_scan8x8;
3542         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3543     }
3544 }
3545
3546 /**
3547  * Replicates H264 "master" context to thread contexts.
3548  */
3549 static void clone_slice(H264Context *dst, H264Context *src)
3550 {
3551     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3552     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3553     dst->s.current_picture      = src->s.current_picture;
3554     dst->s.linesize             = src->s.linesize;
3555     dst->s.uvlinesize           = src->s.uvlinesize;
3556     dst->s.first_field          = src->s.first_field;
3557
3558     dst->prev_poc_msb           = src->prev_poc_msb;
3559     dst->prev_poc_lsb           = src->prev_poc_lsb;
3560     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3561     dst->prev_frame_num         = src->prev_frame_num;
3562     dst->short_ref_count        = src->short_ref_count;
3563
3564     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3565     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3566     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3567     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3568
3569     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3570     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3571 }
3572
3573 /**
3574  * decodes a slice header.
3575  * This will also call MPV_common_init() and frame_start() as needed.
3576  *
3577  * @param h h264context
3578  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3579  *
3580  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3581  */
3582 static int decode_slice_header(H264Context *h, H264Context *h0){
3583     MpegEncContext * const s = &h->s;
3584     MpegEncContext * const s0 = &h0->s;
3585     unsigned int first_mb_in_slice;
3586     unsigned int pps_id;
3587     int num_ref_idx_active_override_flag;
3588     unsigned int slice_type, tmp, i, j;
3589     int default_ref_list_done = 0;
3590     int last_pic_structure;
3591
3592     s->dropable= h->nal_ref_idc == 0;
3593
3594     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3595         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3596         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3597     }else{
3598         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3599         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3600     }
3601
3602     first_mb_in_slice= get_ue_golomb(&s->gb);
3603
3604     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3605         h0->current_slice = 0;
3606         if (!s0->first_field)
3607             s->current_picture_ptr= NULL;
3608     }
3609
3610     slice_type= get_ue_golomb(&s->gb);
3611     if(slice_type > 9){
3612         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3613         return -1;
3614     }
3615     if(slice_type > 4){
3616         slice_type -= 5;
3617         h->slice_type_fixed=1;
3618     }else
3619         h->slice_type_fixed=0;
3620
3621     slice_type= golomb_to_pict_type[ slice_type ];
3622     if (slice_type == FF_I_TYPE
3623         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3624         default_ref_list_done = 1;
3625     }
3626     h->slice_type= slice_type;
3627     h->slice_type_nos= slice_type & 3;
3628
3629     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3630     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3631         av_log(h->s.avctx, AV_LOG_ERROR,
3632                "B picture before any references, skipping\n");
3633         return -1;
3634     }
3635
3636     pps_id= get_ue_golomb(&s->gb);
3637     if(pps_id>=MAX_PPS_COUNT){
3638         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3639         return -1;
3640     }
3641     if(!h0->pps_buffers[pps_id]) {
3642         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3643         return -1;
3644     }
3645     h->pps= *h0->pps_buffers[pps_id];
3646
3647     if(!h0->sps_buffers[h->pps.sps_id]) {
3648         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3649         return -1;
3650     }
3651     h->sps = *h0->sps_buffers[h->pps.sps_id];
3652
3653     if(h == h0 && h->dequant_coeff_pps != pps_id){
3654         h->dequant_coeff_pps = pps_id;
3655         init_dequant_tables(h);
3656     }
3657
3658     s->mb_width= h->sps.mb_width;
3659     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3660
3661     h->b_stride=  s->mb_width*4;
3662     h->b8_stride= s->mb_width*2;
3663
3664     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3665     if(h->sps.frame_mbs_only_flag)
3666         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3667     else
3668         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3669
3670     if (s->context_initialized
3671         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3672         if(h != h0)
3673             return -1;   // width / height changed during parallelized decoding
3674         free_tables(h);
3675         flush_dpb(s->avctx);
3676         MPV_common_end(s);
3677     }
3678     if (!s->context_initialized) {
3679         if(h != h0)
3680             return -1;  // we cant (re-)initialize context during parallel decoding
3681         if (MPV_common_init(s) < 0)
3682             return -1;
3683         s->first_field = 0;
3684
3685         init_scan_tables(h);
3686         alloc_tables(h);
3687
3688         for(i = 1; i < s->avctx->thread_count; i++) {
3689             H264Context *c;
3690             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3691             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3692             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3693             c->sps = h->sps;
3694             c->pps = h->pps;
3695             init_scan_tables(c);
3696             clone_tables(c, h);
3697         }
3698
3699         for(i = 0; i < s->avctx->thread_count; i++)
3700             if(context_init(h->thread_context[i]) < 0)
3701                 return -1;
3702
3703         s->avctx->width = s->width;
3704         s->avctx->height = s->height;
3705         s->avctx->sample_aspect_ratio= h->sps.sar;
3706         if(!s->avctx->sample_aspect_ratio.den)
3707             s->avctx->sample_aspect_ratio.den = 1;
3708
3709         if(h->sps.timing_info_present_flag){
3710             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3711             if(h->x264_build > 0 && h->x264_build < 44)
3712                 s->avctx->time_base.den *= 2;
3713             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3714                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3715         }
3716     }
3717
3718     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3719
3720     h->mb_mbaff = 0;
3721     h->mb_aff_frame = 0;
3722     last_pic_structure = s0->picture_structure;
3723     if(h->sps.frame_mbs_only_flag){
3724         s->picture_structure= PICT_FRAME;
3725     }else{
3726         if(get_bits1(&s->gb)) { //field_pic_flag
3727             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3728         } else {
3729             s->picture_structure= PICT_FRAME;
3730             h->mb_aff_frame = h->sps.mb_aff;
3731         }
3732     }
3733     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3734
3735     if(h0->current_slice == 0){
3736         while(h->frame_num !=  h->prev_frame_num &&
3737               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3738             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3739             frame_start(h);
3740             h->prev_frame_num++;
3741             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3742             s->current_picture_ptr->frame_num= h->prev_frame_num;
3743             execute_ref_pic_marking(h, NULL, 0);
3744         }
3745
3746         /* See if we have a decoded first field looking for a pair... */
3747         if (s0->first_field) {
3748             assert(s0->current_picture_ptr);
3749             assert(s0->current_picture_ptr->data[0]);
3750             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3751
3752             /* figure out if we have a complementary field pair */
3753             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3754                 /*
3755                  * Previous field is unmatched. Don't display it, but let it
3756                  * remain for reference if marked as such.
3757                  */
3758                 s0->current_picture_ptr = NULL;
3759                 s0->first_field = FIELD_PICTURE;
3760
3761             } else {
3762                 if (h->nal_ref_idc &&
3763                         s0->current_picture_ptr->reference &&
3764                         s0->current_picture_ptr->frame_num != h->frame_num) {
3765                     /*
3766                      * This and previous field were reference, but had
3767                      * different frame_nums. Consider this field first in
3768                      * pair. Throw away previous field except for reference
3769                      * purposes.
3770                      */
3771                     s0->first_field = 1;
3772                     s0->current_picture_ptr = NULL;
3773
3774                 } else {
3775                     /* Second field in complementary pair */
3776                     s0->first_field = 0;
3777                 }
3778             }
3779
3780         } else {
3781             /* Frame or first field in a potentially complementary pair */
3782             assert(!s0->current_picture_ptr);
3783             s0->first_field = FIELD_PICTURE;
3784         }
3785
3786         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3787             s0->first_field = 0;
3788             return -1;
3789         }
3790     }
3791     if(h != h0)
3792         clone_slice(h, h0);
3793
3794     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3795
3796     assert(s->mb_num == s->mb_width * s->mb_height);
3797     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3798        first_mb_in_slice                    >= s->mb_num){
3799         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3800         return -1;
3801     }
3802     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3803     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3804     if (s->picture_structure == PICT_BOTTOM_FIELD)
3805         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3806     assert(s->mb_y < s->mb_height);
3807
3808     if(s->picture_structure==PICT_FRAME){
3809         h->curr_pic_num=   h->frame_num;
3810         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3811     }else{
3812         h->curr_pic_num= 2*h->frame_num + 1;
3813         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3814     }
3815
3816     if(h->nal_unit_type == NAL_IDR_SLICE){
3817         get_ue_golomb(&s->gb); /* idr_pic_id */
3818     }
3819
3820     if(h->sps.poc_type==0){
3821         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3822
3823         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3824             h->delta_poc_bottom= get_se_golomb(&s->gb);
3825         }
3826     }
3827
3828     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3829         h->delta_poc[0]= get_se_golomb(&s->gb);
3830
3831         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3832             h->delta_poc[1]= get_se_golomb(&s->gb);
3833     }
3834
3835     init_poc(h);
3836
3837     if(h->pps.redundant_pic_cnt_present){
3838         h->redundant_pic_count= get_ue_golomb(&s->gb);
3839     }
3840
3841     //set defaults, might be overridden a few lines later
3842     h->ref_count[0]= h->pps.ref_count[0];
3843     h->ref_count[1]= h->pps.ref_count[1];
3844
3845     if(h->slice_type_nos != FF_I_TYPE){
3846         if(h->slice_type_nos == FF_B_TYPE){
3847             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3848         }
3849         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3850
3851         if(num_ref_idx_active_override_flag){
3852             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3853             if(h->slice_type_nos==FF_B_TYPE)
3854                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3855
3856             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3857                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3858                 h->ref_count[0]= h->ref_count[1]= 1;
3859                 return -1;
3860             }
3861         }
3862         if(h->slice_type_nos == FF_B_TYPE)
3863             h->list_count= 2;
3864         else
3865             h->list_count= 1;
3866     }else
3867         h->list_count= 0;
3868
3869     if(!default_ref_list_done){
3870         fill_default_ref_list(h);
3871     }
3872
3873     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3874         return -1;
3875
3876     if(h->slice_type_nos!=FF_I_TYPE){
3877         s->last_picture_ptr= &h->ref_list[0][0];
3878         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3879     }
3880     if(h->slice_type_nos==FF_B_TYPE){
3881         s->next_picture_ptr= &h->ref_list[1][0];
3882         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3883     }
3884
3885     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3886        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3887         pred_weight_table(h);
3888     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3889         implicit_weight_table(h);
3890     else
3891         h->use_weight = 0;
3892
3893     if(h->nal_ref_idc)
3894         decode_ref_pic_marking(h0, &s->gb);
3895
3896     if(FRAME_MBAFF)
3897         fill_mbaff_ref_list(h);
3898
3899     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3900         direct_dist_scale_factor(h);
3901     direct_ref_list_init(h);
3902
3903     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3904         tmp = get_ue_golomb(&s->gb);
3905         if(tmp > 2){
3906             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3907             return -1;
3908         }
3909         h->cabac_init_idc= tmp;
3910     }
3911
3912     h->last_qscale_diff = 0;
3913     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3914     if(tmp>51){
3915         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3916         return -1;
3917     }
3918     s->qscale= tmp;
3919     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3920     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3921     //FIXME qscale / qp ... stuff
3922     if(h->slice_type == FF_SP_TYPE){
3923         get_bits1(&s->gb); /* sp_for_switch_flag */
3924     }
3925     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3926         get_se_golomb(&s->gb); /* slice_qs_delta */
3927     }
3928
3929     h->deblocking_filter = 1;
3930     h->slice_alpha_c0_offset = 0;
3931     h->slice_beta_offset = 0;
3932     if( h->pps.deblocking_filter_parameters_present ) {
3933         tmp= get_ue_golomb(&s->gb);
3934         if(tmp > 2){
3935             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3936             return -1;
3937         }
3938         h->deblocking_filter= tmp;
3939         if(h->deblocking_filter < 2)
3940             h->deblocking_filter^= 1; // 1<->0
3941
3942         if( h->deblocking_filter ) {
3943             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3944             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3945         }
3946     }
3947
3948     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3949        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3950        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3951        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3952         h->deblocking_filter= 0;
3953
3954     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3955         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3956             /* Cheat slightly for speed:
3957                Do not bother to deblock across slices. */
3958             h->deblocking_filter = 2;
3959         } else {
3960             h0->max_contexts = 1;
3961             if(!h0->single_decode_warning) {
3962                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3963                 h0->single_decode_warning = 1;
3964             }
3965             if(h != h0)
3966                 return 1; // deblocking switched inside frame
3967         }
3968     }
3969
3970 #if 0 //FMO
3971     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3972         slice_group_change_cycle= get_bits(&s->gb, ?);
3973 #endif
3974
3975     h0->last_slice_type = slice_type;
3976     h->slice_num = ++h0->current_slice;
3977     if(h->slice_num >= MAX_SLICES){
3978         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3979     }
3980
3981     for(j=0; j<2; j++){
3982         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3983         ref2frm[0]=
3984         ref2frm[1]= -1;
3985         for(i=0; i<16; i++)
3986             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3987                           +(h->ref_list[j][i].reference&3);
3988         ref2frm[18+0]=
3989         ref2frm[18+1]= -1;
3990         for(i=16; i<48; i++)
3991             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3992                           +(h->ref_list[j][i].reference&3);
3993     }
3994
3995     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3996     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3997
3998     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3999         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4000                h->slice_num,
4001                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4002                first_mb_in_slice,
4003                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4004                pps_id, h->frame_num,
4005                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4006                h->ref_count[0], h->ref_count[1],
4007                s->qscale,
4008                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4009                h->use_weight,
4010                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4011                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4012                );
4013     }
4014
4015     return 0;
4016 }
4017
4018 /**
4019  *
4020  */
4021 static inline int get_level_prefix(GetBitContext *gb){
4022     unsigned int buf;
4023     int log;
4024
4025     OPEN_READER(re, gb);
4026     UPDATE_CACHE(re, gb);
4027     buf=GET_CACHE(re, gb);
4028
4029     log= 32 - av_log2(buf);
4030 #ifdef TRACE
4031     print_bin(buf>>(32-log), log);
4032     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4033 #endif
4034
4035     LAST_SKIP_BITS(re, gb, log);
4036     CLOSE_READER(re, gb);
4037
4038     return log-1;
4039 }
4040
4041 static inline int get_dct8x8_allowed(H264Context *h){
4042     if(h->sps.direct_8x8_inference_flag)
4043         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4044     else
4045         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4046 }
4047
4048 /**
4049  * decodes a residual block.
4050  * @param n block index
4051  * @param scantable scantable
4052  * @param max_coeff number of coefficients in the block
4053  * @return <0 if an error occurred
4054  */
4055 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4056     MpegEncContext * const s = &h->s;
4057     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4058     int level[16];
4059     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4060
4061     //FIXME put trailing_onex into the context
4062
4063     if(n == CHROMA_DC_BLOCK_INDEX){
4064         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4065         total_coeff= coeff_token>>2;
4066     }else{
4067         if(n == LUMA_DC_BLOCK_INDEX){
4068             total_coeff= pred_non_zero_count(h, 0);
4069             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4070             total_coeff= coeff_token>>2;
4071         }else{
4072             total_coeff= pred_non_zero_count(h, n);
4073             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4074             total_coeff= coeff_token>>2;
4075             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4076         }
4077     }
4078
4079     //FIXME set last_non_zero?
4080
4081     if(total_coeff==0)
4082         return 0;
4083     if(total_coeff > (unsigned)max_coeff) {
4084         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4085         return -1;
4086     }
4087
4088     trailing_ones= coeff_token&3;
4089     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4090     assert(total_coeff<=16);
4091
4092     i = show_bits(gb, 3);
4093     skip_bits(gb, trailing_ones);
4094     level[0] = 1-((i&4)>>1);
4095     level[1] = 1-((i&2)   );
4096     level[2] = 1-((i&1)<<1);
4097
4098     if(trailing_ones<total_coeff) {
4099         int level_code, mask;
4100         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4101         int prefix= get_level_prefix(gb);
4102
4103         //first coefficient has suffix_length equal to 0 or 1
4104         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4105             if(suffix_length)
4106                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4107             else
4108                 level_code= (prefix<<suffix_length); //part
4109         }else if(prefix==14){
4110             if(suffix_length)
4111                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4112             else
4113                 level_code= prefix + get_bits(gb, 4); //part
4114         }else{
4115             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4116             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4117             if(prefix>=16)
4118                 level_code += (1<<(prefix-3))-4096;
4119         }
4120
4121         if(trailing_ones < 3) level_code += 2;
4122
4123         suffix_length = 1;
4124         if(level_code > 5)
4125             suffix_length++;
4126         mask= -(level_code&1);
4127         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4128
4129         //remaining coefficients have suffix_length > 0
4130         for(i=trailing_ones+1;i<total_coeff;i++) {
4131             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4132             prefix = get_level_prefix(gb);
4133             if(prefix<15){
4134                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4135             }else{
4136                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4137                 if(prefix>=16)
4138                     level_code += (1<<(prefix-3))-4096;
4139             }
4140             mask= -(level_code&1);
4141             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4142             if(level_code > suffix_limit[suffix_length])
4143                 suffix_length++;
4144         }
4145     }
4146
4147     if(total_coeff == max_coeff)
4148         zeros_left=0;
4149     else{
4150         if(n == CHROMA_DC_BLOCK_INDEX)
4151             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4152         else
4153             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4154     }
4155
4156     coeff_num = zeros_left + total_coeff - 1;
4157     j = scantable[coeff_num];
4158     if(n > 24){
4159         block[j] = level[0];
4160         for(i=1;i<total_coeff;i++) {
4161             if(zeros_left <= 0)
4162                 run_before = 0;
4163             else if(zeros_left < 7){
4164                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4165             }else{
4166                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4167             }
4168             zeros_left -= run_before;
4169             coeff_num -= 1 + run_before;
4170             j= scantable[ coeff_num ];
4171
4172             block[j]= level[i];
4173         }
4174     }else{
4175         block[j] = (level[0] * qmul[j] + 32)>>6;
4176         for(i=1;i<total_coeff;i++) {
4177             if(zeros_left <= 0)
4178                 run_before = 0;
4179             else if(zeros_left < 7){
4180                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4181             }else{
4182                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4183             }
4184             zeros_left -= run_before;
4185             coeff_num -= 1 + run_before;
4186             j= scantable[ coeff_num ];
4187
4188             block[j]= (level[i] * qmul[j] + 32)>>6;
4189         }
4190     }
4191
4192     if(zeros_left<0){
4193         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4194         return -1;
4195     }
4196
4197     return 0;
4198 }
4199
4200 static void predict_field_decoding_flag(H264Context *h){
4201     MpegEncContext * const s = &h->s;
4202     const int mb_xy= h->mb_xy;
4203     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4204                 ? s->current_picture.mb_type[mb_xy-1]
4205                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4206                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4207                 : 0;
4208     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4209 }
4210
4211 /**
4212  * decodes a P_SKIP or B_SKIP macroblock
4213  */
4214 static void decode_mb_skip(H264Context *h){
4215     MpegEncContext * const s = &h->s;
4216     const int mb_xy= h->mb_xy;
4217     int mb_type=0;
4218
4219     memset(h->non_zero_count[mb_xy], 0, 16);
4220     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4221
4222     if(MB_FIELD)
4223         mb_type|= MB_TYPE_INTERLACED;
4224
4225     if( h->slice_type_nos == FF_B_TYPE )
4226     {
4227         // just for fill_caches. pred_direct_motion will set the real mb_type
4228         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4229
4230         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4231         pred_direct_motion(h, &mb_type);
4232         mb_type|= MB_TYPE_SKIP;
4233     }
4234     else
4235     {
4236         int mx, my;
4237         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4238
4239         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4240         pred_pskip_motion(h, &mx, &my);
4241         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4242         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4243     }
4244
4245     write_back_motion(h, mb_type);
4246     s->current_picture.mb_type[mb_xy]= mb_type;
4247     s->current_picture.qscale_table[mb_xy]= s->qscale;
4248     h->slice_table[ mb_xy ]= h->slice_num;
4249     h->prev_mb_skipped= 1;
4250 }
4251
4252 /**
4253  * decodes a macroblock
4254  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4255  */
4256 static int decode_mb_cavlc(H264Context *h){
4257     MpegEncContext * const s = &h->s;
4258     int mb_xy;
4259     int partition_count;
4260     unsigned int mb_type, cbp;
4261     int dct8x8_allowed= h->pps.transform_8x8_mode;
4262
4263     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4264
4265     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4266
4267     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4268     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4269                 down the code */
4270     if(h->slice_type_nos != FF_I_TYPE){
4271         if(s->mb_skip_run==-1)
4272             s->mb_skip_run= get_ue_golomb(&s->gb);
4273
4274         if (s->mb_skip_run--) {
4275             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4276                 if(s->mb_skip_run==0)
4277                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4278                 else
4279                     predict_field_decoding_flag(h);
4280             }
4281             decode_mb_skip(h);
4282             return 0;
4283         }
4284     }
4285     if(FRAME_MBAFF){
4286         if( (s->mb_y&1) == 0 )
4287             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4288     }
4289
4290     h->prev_mb_skipped= 0;
4291
4292     mb_type= get_ue_golomb(&s->gb);
4293     if(h->slice_type_nos == FF_B_TYPE){
4294         if(mb_type < 23){
4295             partition_count= b_mb_type_info[mb_type].partition_count;
4296             mb_type=         b_mb_type_info[mb_type].type;
4297         }else{
4298             mb_type -= 23;
4299             goto decode_intra_mb;
4300         }
4301     }else if(h->slice_type_nos == FF_P_TYPE){
4302         if(mb_type < 5){
4303             partition_count= p_mb_type_info[mb_type].partition_count;
4304             mb_type=         p_mb_type_info[mb_type].type;
4305         }else{
4306             mb_type -= 5;
4307             goto decode_intra_mb;
4308         }
4309     }else{
4310        assert(h->slice_type_nos == FF_I_TYPE);
4311         if(h->slice_type == FF_SI_TYPE && mb_type)
4312             mb_type--;
4313 decode_intra_mb:
4314         if(mb_type > 25){
4315             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4316             return -1;
4317         }
4318         partition_count=0;
4319         cbp= i_mb_type_info[mb_type].cbp;
4320         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4321         mb_type= i_mb_type_info[mb_type].type;
4322     }
4323
4324     if(MB_FIELD)
4325         mb_type |= MB_TYPE_INTERLACED;
4326
4327     h->slice_table[ mb_xy ]= h->slice_num;
4328
4329     if(IS_INTRA_PCM(mb_type)){
4330         unsigned int x;
4331
4332         // We assume these blocks are very rare so we do not optimize it.
4333         align_get_bits(&s->gb);
4334
4335         // The pixels are stored in the same order as levels in h->mb array.
4336         for(x=0; x < (CHROMA ? 384 : 256); x++){
4337             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4338         }
4339
4340         // In deblocking, the quantizer is 0
4341         s->current_picture.qscale_table[mb_xy]= 0;
4342         // All coeffs are present
4343         memset(h->non_zero_count[mb_xy], 16, 16);
4344
4345         s->current_picture.mb_type[mb_xy]= mb_type;
4346         return 0;
4347     }
4348
4349     if(MB_MBAFF){
4350         h->ref_count[0] <<= 1;
4351         h->ref_count[1] <<= 1;
4352     }
4353
4354     fill_caches(h, mb_type, 0);
4355
4356     //mb_pred
4357     if(IS_INTRA(mb_type)){
4358         int pred_mode;
4359 //            init_top_left_availability(h);
4360         if(IS_INTRA4x4(mb_type)){
4361             int i;
4362             int di = 1;
4363             if(dct8x8_allowed && get_bits1(&s->gb)){
4364                 mb_type |= MB_TYPE_8x8DCT;
4365                 di = 4;
4366             }
4367
4368 //                fill_intra4x4_pred_table(h);
4369             for(i=0; i<16; i+=di){
4370                 int mode= pred_intra_mode(h, i);
4371
4372                 if(!get_bits1(&s->gb)){
4373                     const int rem_mode= get_bits(&s->gb, 3);
4374                     mode = rem_mode + (rem_mode >= mode);
4375                 }
4376
4377                 if(di==4)
4378                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4379                 else
4380                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4381             }
4382             write_back_intra_pred_mode(h);
4383             if( check_intra4x4_pred_mode(h) < 0)
4384                 return -1;
4385         }else{
4386             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4387             if(h->intra16x16_pred_mode < 0)
4388                 return -1;
4389         }
4390         if(CHROMA){
4391             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4392             if(pred_mode < 0)
4393                 return -1;
4394             h->chroma_pred_mode= pred_mode;
4395         }
4396     }else if(partition_count==4){
4397         int i, j, sub_partition_count[4], list, ref[2][4];
4398
4399         if(h->slice_type_nos == FF_B_TYPE){
4400             for(i=0; i<4; i++){
4401                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4402                 if(h->sub_mb_type[i] >=13){
4403                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4404                     return -1;
4405                 }
4406                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4407                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4408             }
4409             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4410                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4411                 pred_direct_motion(h, &mb_type);
4412                 h->ref_cache[0][scan8[4]] =
4413                 h->ref_cache[1][scan8[4]] =
4414                 h->ref_cache[0][scan8[12]] =
4415                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4416             }
4417         }else{
4418             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4419             for(i=0; i<4; i++){
4420                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4421                 if(h->sub_mb_type[i] >=4){
4422                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4423                     return -1;
4424                 }
4425                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4426                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4427             }
4428         }
4429
4430         for(list=0; list<h->list_count; list++){
4431             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4432             for(i=0; i<4; i++){
4433                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4434                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4435                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4436                     if(tmp>=ref_count){
4437                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4438                         return -1;
4439                     }
4440                     ref[list][i]= tmp;
4441                 }else{
4442                  //FIXME
4443                     ref[list][i] = -1;
4444                 }
4445             }
4446         }
4447
4448         if(dct8x8_allowed)
4449             dct8x8_allowed = get_dct8x8_allowed(h);
4450
4451         for(list=0; list<h->list_count; list++){
4452             for(i=0; i<4; i++){
4453                 if(IS_DIRECT(h->sub_mb_type[i])) {
4454                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4455                     continue;
4456                 }
4457                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4458                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4459
4460                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4461                     const int sub_mb_type= h->sub_mb_type[i];
4462                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4463                     for(j=0; j<sub_partition_count[i]; j++){
4464                         int mx, my;
4465                         const int index= 4*i + block_width*j;
4466                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4467                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4468                         mx += get_se_golomb(&s->gb);
4469                         my += get_se_golomb(&s->gb);
4470                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4471
4472                         if(IS_SUB_8X8(sub_mb_type)){
4473                             mv_cache[ 1 ][0]=
4474                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4475                             mv_cache[ 1 ][1]=
4476                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4477                         }else if(IS_SUB_8X4(sub_mb_type)){
4478                             mv_cache[ 1 ][0]= mx;
4479                             mv_cache[ 1 ][1]= my;
4480                         }else if(IS_SUB_4X8(sub_mb_type)){
4481                             mv_cache[ 8 ][0]= mx;
4482                             mv_cache[ 8 ][1]= my;
4483                         }
4484                         mv_cache[ 0 ][0]= mx;
4485                         mv_cache[ 0 ][1]= my;
4486                     }
4487                 }else{
4488                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4489                     p[0] = p[1]=
4490                     p[8] = p[9]= 0;
4491                 }
4492             }
4493         }
4494     }else if(IS_DIRECT(mb_type)){
4495         pred_direct_motion(h, &mb_type);
4496         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4497     }else{
4498         int list, mx, my, i;
4499          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4500         if(IS_16X16(mb_type)){
4501             for(list=0; list<h->list_count; list++){
4502                     unsigned int val;
4503                     if(IS_DIR(mb_type, 0, list)){
4504                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4505                         if(val >= h->ref_count[list]){
4506                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4507                             return -1;
4508                         }
4509                     }else
4510                         val= LIST_NOT_USED&0xFF;
4511                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4512             }
4513             for(list=0; list<h->list_count; list++){
4514                 unsigned int val;
4515                 if(IS_DIR(mb_type, 0, list)){
4516                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4517                     mx += get_se_golomb(&s->gb);
4518                     my += get_se_golomb(&s->gb);
4519                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4520
4521                     val= pack16to32(mx,my);
4522                 }else
4523                     val=0;
4524                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4525             }
4526         }
4527         else if(IS_16X8(mb_type)){
4528             for(list=0; list<h->list_count; list++){
4529                     for(i=0; i<2; i++){
4530                         unsigned int val;
4531                         if(IS_DIR(mb_type, i, list)){
4532                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4533                             if(val >= h->ref_count[list]){
4534                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4535                                 return -1;
4536                             }
4537                         }else
4538                             val= LIST_NOT_USED&0xFF;
4539                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4540                     }
4541             }
4542             for(list=0; list<h->list_count; list++){
4543                 for(i=0; i<2; i++){
4544                     unsigned int val;
4545                     if(IS_DIR(mb_type, i, list)){
4546                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4547                         mx += get_se_golomb(&s->gb);
4548                         my += get_se_golomb(&s->gb);
4549                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4550
4551                         val= pack16to32(mx,my);
4552                     }else
4553                         val=0;
4554                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4555                 }
4556             }
4557         }else{
4558             assert(IS_8X16(mb_type));
4559             for(list=0; list<h->list_count; list++){
4560                     for(i=0; i<2; i++){
4561                         unsigned int val;
4562                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4563                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4564                             if(val >= h->ref_count[list]){
4565                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4566                                 return -1;
4567                             }
4568                         }else
4569                             val= LIST_NOT_USED&0xFF;
4570                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4571                     }
4572             }
4573             for(list=0; list<h->list_count; list++){
4574                 for(i=0; i<2; i++){
4575                     unsigned int val;
4576                     if(IS_DIR(mb_type, i, list)){
4577                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4578                         mx += get_se_golomb(&s->gb);
4579                         my += get_se_golomb(&s->gb);
4580                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4581
4582                         val= pack16to32(mx,my);
4583                     }else
4584                         val=0;
4585                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4586                 }
4587             }
4588         }
4589     }
4590
4591     if(IS_INTER(mb_type))
4592         write_back_motion(h, mb_type);
4593
4594     if(!IS_INTRA16x16(mb_type)){
4595         cbp= get_ue_golomb(&s->gb);
4596         if(cbp > 47){
4597             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4598             return -1;
4599         }
4600
4601         if(CHROMA){
4602             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4603             else                     cbp= golomb_to_inter_cbp   [cbp];
4604         }else{
4605             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4606             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4607         }
4608     }
4609     h->cbp = cbp;
4610
4611     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4612         if(get_bits1(&s->gb)){
4613             mb_type |= MB_TYPE_8x8DCT;
4614             h->cbp_table[mb_xy]= cbp;
4615         }
4616     }
4617     s->current_picture.mb_type[mb_xy]= mb_type;
4618
4619     if(cbp || IS_INTRA16x16(mb_type)){
4620         int i8x8, i4x4, chroma_idx;
4621         int dquant;
4622         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4623         const uint8_t *scan, *scan8x8, *dc_scan;
4624
4625 //        fill_non_zero_count_cache(h);
4626
4627         if(IS_INTERLACED(mb_type)){
4628             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4629             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4630             dc_scan= luma_dc_field_scan;
4631         }else{
4632             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4633             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4634             dc_scan= luma_dc_zigzag_scan;
4635         }
4636
4637         dquant= get_se_golomb(&s->gb);
4638
4639         if( dquant > 25 || dquant < -26 ){
4640             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4641             return -1;
4642         }
4643
4644         s->qscale += dquant;
4645         if(((unsigned)s->qscale) > 51){
4646             if(s->qscale<0) s->qscale+= 52;
4647             else            s->qscale-= 52;
4648         }
4649
4650         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4651         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4652         if(IS_INTRA16x16(mb_type)){
4653             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4654                 return -1; //FIXME continue if partitioned and other return -1 too
4655             }
4656
4657             assert((cbp&15) == 0 || (cbp&15) == 15);
4658
4659             if(cbp&15){
4660                 for(i8x8=0; i8x8<4; i8x8++){
4661                     for(i4x4=0; i4x4<4; i4x4++){
4662                         const int index= i4x4 + 4*i8x8;
4663                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4664                             return -1;
4665                         }
4666                     }
4667                 }
4668             }else{
4669                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4670             }
4671         }else{
4672             for(i8x8=0; i8x8<4; i8x8++){
4673                 if(cbp & (1<<i8x8)){
4674                     if(IS_8x8DCT(mb_type)){
4675                         DCTELEM *buf = &h->mb[64*i8x8];
4676                         uint8_t *nnz;
4677                         for(i4x4=0; i4x4<4; i4x4++){
4678                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4679                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4680                                 return -1;
4681                         }
4682                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4683                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4684                     }else{
4685                         for(i4x4=0; i4x4<4; i4x4++){
4686                             const int index= i4x4 + 4*i8x8;
4687
4688                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4689                                 return -1;
4690                             }
4691                         }
4692                     }
4693                 }else{
4694                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4695                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4696                 }
4697             }
4698         }
4699
4700         if(cbp&0x30){
4701             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4702                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4703                     return -1;
4704                 }
4705         }
4706
4707         if(cbp&0x20){
4708             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4709                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4710                 for(i4x4=0; i4x4<4; i4x4++){
4711                     const int index= 16 + 4*chroma_idx + i4x4;
4712                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4713                         return -1;
4714                     }
4715                 }
4716             }
4717         }else{
4718             uint8_t * const nnz= &h->non_zero_count_cache[0];
4719             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4720             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4721         }
4722     }else{
4723         uint8_t * const nnz= &h->non_zero_count_cache[0];
4724         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4725         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4726         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4727     }
4728     s->current_picture.qscale_table[mb_xy]= s->qscale;
4729     write_back_non_zero_count(h);
4730
4731     if(MB_MBAFF){
4732         h->ref_count[0] >>= 1;
4733         h->ref_count[1] >>= 1;
4734     }
4735
4736     return 0;
4737 }
4738
4739 static int decode_cabac_field_decoding_flag(H264Context *h) {
4740     MpegEncContext * const s = &h->s;
4741     const int mb_x = s->mb_x;
4742     const int mb_y = s->mb_y & ~1;
4743     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4744     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4745
4746     unsigned int ctx = 0;
4747
4748     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4749         ctx += 1;
4750     }
4751     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4752         ctx += 1;
4753     }
4754
4755     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4756 }
4757
4758 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4759     uint8_t *state= &h->cabac_state[ctx_base];
4760     int mb_type;
4761
4762     if(intra_slice){
4763         MpegEncContext * const s = &h->s;
4764         const int mba_xy = h->left_mb_xy[0];
4765         const int mbb_xy = h->top_mb_xy;
4766         int ctx=0;
4767         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4768             ctx++;
4769         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4770             ctx++;
4771         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4772             return 0;   /* I4x4 */
4773         state += 2;
4774     }else{
4775         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4776             return 0;   /* I4x4 */
4777     }
4778
4779     if( get_cabac_terminate( &h->cabac ) )
4780         return 25;  /* PCM */
4781
4782     mb_type = 1; /* I16x16 */
4783     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4784     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4785         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4786     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4787     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4788     return mb_type;
4789 }
4790
4791 static int decode_cabac_mb_type( H264Context *h ) {
4792     MpegEncContext * const s = &h->s;
4793
4794     if( h->slice_type_nos == FF_I_TYPE ) {
4795         return decode_cabac_intra_mb_type(h, 3, 1);
4796     } else if( h->slice_type_nos == FF_P_TYPE ) {
4797         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4798             /* P-type */
4799             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4800                 /* P_L0_D16x16, P_8x8 */
4801                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4802             } else {
4803                 /* P_L0_D8x16, P_L0_D16x8 */
4804                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4805             }
4806         } else {
4807             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4808         }
4809     } else {
4810         const int mba_xy = h->left_mb_xy[0];
4811         const int mbb_xy = h->top_mb_xy;
4812         int ctx = 0;
4813         int bits;
4814         assert(h->slice_type_nos == FF_B_TYPE);
4815
4816         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4817             ctx++;
4818         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4819             ctx++;
4820
4821         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4822             return 0; /* B_Direct_16x16 */
4823
4824         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4825             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4826         }
4827
4828         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4829         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4830         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4831         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4832         if( bits < 8 )
4833             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4834         else if( bits == 13 ) {
4835             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4836         } else if( bits == 14 )
4837             return 11; /* B_L1_L0_8x16 */
4838         else if( bits == 15 )
4839             return 22; /* B_8x8 */
4840
4841         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4842         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4843     }
4844 }
4845
4846 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4847     MpegEncContext * const s = &h->s;
4848     int mba_xy, mbb_xy;
4849     int ctx = 0;
4850
4851     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4852         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4853         mba_xy = mb_xy - 1;
4854         if( (mb_y&1)
4855             && h->slice_table[mba_xy] == h->slice_num
4856             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4857             mba_xy += s->mb_stride;
4858         if( MB_FIELD ){
4859             mbb_xy = mb_xy - s->mb_stride;
4860             if( !(mb_y&1)
4861                 && h->slice_table[mbb_xy] == h->slice_num
4862                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4863                 mbb_xy -= s->mb_stride;
4864         }else
4865             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4866     }else{
4867         int mb_xy = h->mb_xy;
4868         mba_xy = mb_xy - 1;
4869         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4870     }
4871
4872     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4873         ctx++;
4874     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4875         ctx++;
4876
4877     if( h->slice_type_nos == FF_B_TYPE )
4878         ctx += 13;
4879     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4880 }
4881
4882 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4883     int mode = 0;
4884
4885     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4886         return pred_mode;
4887
4888     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4889     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4890     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4891
4892     if( mode >= pred_mode )
4893         return mode + 1;
4894     else
4895         return mode;
4896 }
4897
4898 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4899     const int mba_xy = h->left_mb_xy[0];
4900     const int mbb_xy = h->top_mb_xy;
4901
4902     int ctx = 0;
4903
4904     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4905     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4906         ctx++;
4907
4908     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4909         ctx++;
4910
4911     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4912         return 0;
4913
4914     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4915         return 1;
4916     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4917         return 2;
4918     else
4919         return 3;
4920 }
4921
4922 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4923     int cbp_b, cbp_a, ctx, cbp = 0;
4924
4925     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4926     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4927
4928     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4929     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4930     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4931     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4932     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4933     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4934     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4935     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4936     return cbp;
4937 }
4938 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4939     int ctx;
4940     int cbp_a, cbp_b;
4941
4942     cbp_a = (h->left_cbp>>4)&0x03;
4943     cbp_b = (h-> top_cbp>>4)&0x03;
4944
4945     ctx = 0;
4946     if( cbp_a > 0 ) ctx++;
4947     if( cbp_b > 0 ) ctx += 2;
4948     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4949         return 0;
4950
4951     ctx = 4;
4952     if( cbp_a == 2 ) ctx++;
4953     if( cbp_b == 2 ) ctx += 2;
4954     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4955 }
4956 static int decode_cabac_mb_dqp( H264Context *h) {
4957     int   ctx= h->last_qscale_diff != 0;
4958     int   val = 0;
4959
4960     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4961         ctx= 2+(ctx>>1);
4962         val++;
4963         if(val > 102) //prevent infinite loop
4964             return INT_MIN;
4965     }
4966
4967     if( val&0x01 )
4968         return   (val + 1)>>1 ;
4969     else
4970         return -((val + 1)>>1);
4971 }
4972 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4973     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4974         return 0;   /* 8x8 */
4975     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4976         return 1;   /* 8x4 */
4977     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4978         return 2;   /* 4x8 */
4979     return 3;       /* 4x4 */
4980 }
4981 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4982     int type;
4983     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4984         return 0;   /* B_Direct_8x8 */
4985     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4986         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4987     type = 3;
4988     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4989         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4990             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4991         type += 4;
4992     }
4993     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4994     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4995     return type;
4996 }
4997
4998 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
4999     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5000 }
5001
5002 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5003     int refa = h->ref_cache[list][scan8[n] - 1];
5004     int refb = h->ref_cache[list][scan8[n] - 8];
5005     int ref  = 0;
5006     int ctx  = 0;
5007
5008     if( h->slice_type_nos == FF_B_TYPE) {
5009         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5010             ctx++;
5011         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5012             ctx += 2;
5013     } else {
5014         if( refa > 0 )
5015             ctx++;
5016         if( refb > 0 )
5017             ctx += 2;
5018     }
5019
5020     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5021         ref++;
5022         if( ctx < 4 )
5023             ctx = 4;
5024         else
5025             ctx = 5;
5026         if(ref >= 32 /*h->ref_list[list]*/){
5027             return -1;
5028         }
5029     }
5030     return ref;
5031 }
5032
5033 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5034     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5035                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5036     int ctxbase = (l == 0) ? 40 : 47;
5037     int mvd;
5038     int ctx = (amvd>2) + (amvd>32);
5039
5040     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5041         return 0;
5042
5043     mvd= 1;
5044     ctx= 3;
5045     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5046         mvd++;
5047         if( ctx < 6 )
5048             ctx++;
5049     }
5050
5051     if( mvd >= 9 ) {
5052         int k = 3;
5053         while( get_cabac_bypass( &h->cabac ) ) {
5054             mvd += 1 << k;
5055             k++;
5056             if(k>24){
5057                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5058                 return INT_MIN;
5059             }
5060         }
5061         while( k-- ) {
5062             if( get_cabac_bypass( &h->cabac ) )
5063                 mvd += 1 << k;
5064         }
5065     }
5066     return get_cabac_bypass_sign( &h->cabac, -mvd );
5067 }
5068
5069 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5070     int nza, nzb;
5071     int ctx = 0;
5072
5073     if( is_dc ) {
5074         if( cat == 0 ) {
5075             nza = h->left_cbp&0x100;
5076             nzb = h-> top_cbp&0x100;
5077         } else {
5078             nza = (h->left_cbp>>(6+idx))&0x01;
5079             nzb = (h-> top_cbp>>(6+idx))&0x01;
5080         }
5081     } else {
5082         assert(cat == 1 || cat == 2 || cat == 4);
5083         nza = h->non_zero_count_cache[scan8[idx] - 1];
5084         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5085     }
5086
5087     if( nza > 0 )
5088         ctx++;
5089
5090     if( nzb > 0 )
5091         ctx += 2;
5092
5093     return ctx + 4 * cat;
5094 }
5095
5096 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5097     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5098     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5099     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5100     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5101 };
5102
5103 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5104     static const int significant_coeff_flag_offset[2][6] = {
5105       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5106       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5107     };
5108     static const int last_coeff_flag_offset[2][6] = {
5109       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5110       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5111     };
5112     static const int coeff_abs_level_m1_offset[6] = {
5113         227+0, 227+10, 227+20, 227+30, 227+39, 426
5114     };
5115     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5116       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5117         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5118         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5119        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5120       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5121         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5122         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5123         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5124     };
5125     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5126      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5127      * map node ctx => cabac ctx for level=1 */
5128     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5129     /* map node ctx => cabac ctx for level>1 */
5130     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5131     static const uint8_t coeff_abs_level_transition[2][8] = {
5132     /* update node ctx after decoding a level=1 */
5133         { 1, 2, 3, 3, 4, 5, 6, 7 },
5134     /* update node ctx after decoding a level>1 */
5135         { 4, 4, 4, 4, 5, 6, 7, 7 }
5136     };
5137
5138     int index[64];
5139
5140     int av_unused last;
5141     int coeff_count = 0;
5142     int node_ctx = 0;
5143
5144     uint8_t *significant_coeff_ctx_base;
5145     uint8_t *last_coeff_ctx_base;
5146     uint8_t *abs_level_m1_ctx_base;
5147
5148 #ifndef ARCH_X86
5149 #define CABAC_ON_STACK
5150 #endif
5151 #ifdef CABAC_ON_STACK
5152 #define CC &cc
5153     CABACContext cc;
5154     cc.range     = h->cabac.range;
5155     cc.low       = h->cabac.low;
5156     cc.bytestream= h->cabac.bytestream;
5157 #else
5158 #define CC &h->cabac
5159 #endif
5160
5161
5162     /* cat: 0-> DC 16x16  n = 0
5163      *      1-> AC 16x16  n = luma4x4idx
5164      *      2-> Luma4x4   n = luma4x4idx
5165      *      3-> DC Chroma n = iCbCr
5166      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5167      *      5-> Luma8x8   n = 4 * luma8x8idx
5168      */
5169
5170     /* read coded block flag */
5171     if( is_dc || cat != 5 ) {
5172         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5173             if( !is_dc )
5174                 h->non_zero_count_cache[scan8[n]] = 0;
5175
5176 #ifdef CABAC_ON_STACK
5177             h->cabac.range     = cc.range     ;
5178             h->cabac.low       = cc.low       ;
5179             h->cabac.bytestream= cc.bytestream;
5180 #endif
5181             return;
5182         }
5183     }
5184
5185     significant_coeff_ctx_base = h->cabac_state
5186         + significant_coeff_flag_offset[MB_FIELD][cat];
5187     last_coeff_ctx_base = h->cabac_state
5188         + last_coeff_flag_offset[MB_FIELD][cat];
5189     abs_level_m1_ctx_base = h->cabac_state
5190         + coeff_abs_level_m1_offset[cat];
5191
5192     if( !is_dc && cat == 5 ) {
5193 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5194         for(last= 0; last < coefs; last++) { \
5195             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5196             if( get_cabac( CC, sig_ctx )) { \
5197                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5198                 index[coeff_count++] = last; \
5199                 if( get_cabac( CC, last_ctx ) ) { \
5200                     last= max_coeff; \
5201                     break; \
5202                 } \
5203             } \
5204         }\
5205         if( last == max_coeff -1 ) {\
5206             index[coeff_count++] = last;\
5207         }
5208         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5209 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5210         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5211     } else {
5212         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5213 #else
5214         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5215     } else {
5216         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5217 #endif
5218     }
5219     assert(coeff_count > 0);
5220
5221     if( is_dc ) {
5222         if( cat == 0 )
5223             h->cbp_table[h->mb_xy] |= 0x100;
5224         else
5225             h->cbp_table[h->mb_xy] |= 0x40 << n;
5226     } else {
5227         if( cat == 5 )
5228             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5229         else {
5230             assert( cat == 1 || cat == 2 || cat == 4 );
5231             h->non_zero_count_cache[scan8[n]] = coeff_count;
5232         }
5233     }
5234
5235     do {
5236         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5237
5238         int j= scantable[index[--coeff_count]];
5239
5240         if( get_cabac( CC, ctx ) == 0 ) {
5241             node_ctx = coeff_abs_level_transition[0][node_ctx];
5242             if( is_dc ) {
5243                 block[j] = get_cabac_bypass_sign( CC, -1);
5244             }else{
5245                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5246             }
5247         } else {
5248             int coeff_abs = 2;
5249             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5250             node_ctx = coeff_abs_level_transition[1][node_ctx];
5251
5252             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5253                 coeff_abs++;
5254             }
5255
5256             if( coeff_abs >= 15 ) {
5257                 int j = 0;
5258                 while( get_cabac_bypass( CC ) ) {
5259                     j++;
5260                 }
5261
5262                 coeff_abs=1;
5263                 while( j-- ) {
5264                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5265                 }
5266                 coeff_abs+= 14;
5267             }
5268
5269             if( is_dc ) {
5270                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5271             }else{
5272                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5273             }
5274         }
5275     } while( coeff_count );
5276 #ifdef CABAC_ON_STACK
5277             h->cabac.range     = cc.range     ;
5278             h->cabac.low       = cc.low       ;
5279             h->cabac.bytestream= cc.bytestream;
5280 #endif
5281
5282 }
5283
5284 #ifndef CONFIG_SMALL
5285 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5286     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5287 }
5288
5289 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5290     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5291 }
5292 #endif
5293
5294 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5295 #ifdef CONFIG_SMALL
5296     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5297 #else
5298     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5299     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5300 #endif
5301 }
5302
5303 static inline void compute_mb_neighbors(H264Context *h)
5304 {
5305     MpegEncContext * const s = &h->s;
5306     const int mb_xy  = h->mb_xy;
5307     h->top_mb_xy     = mb_xy - s->mb_stride;
5308     h->left_mb_xy[0] = mb_xy - 1;
5309     if(FRAME_MBAFF){
5310         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5311         const int top_pair_xy      = pair_xy     - s->mb_stride;
5312         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5313         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5314         const int curr_mb_frame_flag = !MB_FIELD;
5315         const int bottom = (s->mb_y & 1);
5316         if (bottom
5317                 ? !curr_mb_frame_flag // bottom macroblock
5318                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5319                 ) {
5320             h->top_mb_xy -= s->mb_stride;
5321         }
5322         if (left_mb_frame_flag != curr_mb_frame_flag) {
5323             h->left_mb_xy[0] = pair_xy - 1;
5324         }
5325     } else if (FIELD_PICTURE) {
5326         h->top_mb_xy -= s->mb_stride;
5327     }
5328     return;
5329 }
5330
/**
 * Decodes a macroblock.
 * @returns 0 if OK, a negative value if an error is noticed
 */
5335 static int decode_mb_cabac(H264Context *h) {
5336     MpegEncContext * const s = &h->s;
5337     int mb_xy;
5338     int mb_type, partition_count, cbp = 0;
5339     int dct8x8_allowed= h->pps.transform_8x8_mode;
5340
5341     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5342
5343     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5344
5345     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5346     if( h->slice_type_nos != FF_I_TYPE ) {
5347         int skip;
5348         /* a skipped mb needs the aff flag from the following mb */
5349         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5350             predict_field_decoding_flag(h);
5351         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5352             skip = h->next_mb_skipped;
5353         else
5354             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5355         /* read skip flags */
5356         if( skip ) {
5357             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5358                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5359                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5360                 if(h->next_mb_skipped)
5361                     predict_field_decoding_flag(h);
5362                 else
5363                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5364             }
5365
5366             decode_mb_skip(h);
5367
5368             h->cbp_table[mb_xy] = 0;
5369             h->chroma_pred_mode_table[mb_xy] = 0;
5370             h->last_qscale_diff = 0;
5371
5372             return 0;
5373
5374         }
5375     }
5376     if(FRAME_MBAFF){
5377         if( (s->mb_y&1) == 0 )
5378             h->mb_mbaff =
5379             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5380     }
5381
5382     h->prev_mb_skipped = 0;
5383
5384     compute_mb_neighbors(h);
5385     mb_type = decode_cabac_mb_type( h );
5386     assert(mb_type >= 0);
5387
5388     if( h->slice_type_nos == FF_B_TYPE ) {
5389         if( mb_type < 23 ){
5390             partition_count= b_mb_type_info[mb_type].partition_count;
5391             mb_type=         b_mb_type_info[mb_type].type;
5392         }else{
5393             mb_type -= 23;
5394             goto decode_intra_mb;
5395         }
5396     } else if( h->slice_type_nos == FF_P_TYPE ) {
5397         if( mb_type < 5) {
5398             partition_count= p_mb_type_info[mb_type].partition_count;
5399             mb_type=         p_mb_type_info[mb_type].type;
5400         } else {
5401             mb_type -= 5;
5402             goto decode_intra_mb;
5403         }
5404     } else {
5405         if(h->slice_type == FF_SI_TYPE && mb_type)
5406             mb_type--;
5407         assert(h->slice_type_nos == FF_I_TYPE);
5408 decode_intra_mb:
5409         partition_count = 0;
5410         cbp= i_mb_type_info[mb_type].cbp;
5411         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5412         mb_type= i_mb_type_info[mb_type].type;
5413     }
5414     if(MB_FIELD)
5415         mb_type |= MB_TYPE_INTERLACED;
5416
5417     h->slice_table[ mb_xy ]= h->slice_num;
5418
5419     if(IS_INTRA_PCM(mb_type)) {
5420         const uint8_t *ptr;
5421
5422         // We assume these blocks are very rare so we do not optimize it.
5423         // FIXME The two following lines get the bitstream position in the cabac
5424         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5425         ptr= h->cabac.bytestream;
5426         if(h->cabac.low&0x1) ptr--;
5427         if(CABAC_BITS==16){
5428             if(h->cabac.low&0x1FF) ptr--;
5429         }
5430
5431         // The pixels are stored in the same order as levels in h->mb array.
5432         memcpy(h->mb, ptr, 256); ptr+=256;
5433         if(CHROMA){
5434             memcpy(h->mb+128, ptr, 128); ptr+=128;
5435         }
5436
5437         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5438
5439         // All blocks are present
5440         h->cbp_table[mb_xy] = 0x1ef;
5441         h->chroma_pred_mode_table[mb_xy] = 0;
5442         // In deblocking, the quantizer is 0
5443         s->current_picture.qscale_table[mb_xy]= 0;
5444         // All coeffs are present
5445         memset(h->non_zero_count[mb_xy], 16, 16);
5446         s->current_picture.mb_type[mb_xy]= mb_type;
5447         h->last_qscale_diff = 0;
5448         return 0;
5449     }
5450
5451     if(MB_MBAFF){
5452         h->ref_count[0] <<= 1;
5453         h->ref_count[1] <<= 1;
5454     }
5455
5456     fill_caches(h, mb_type, 0);
5457
5458     if( IS_INTRA( mb_type ) ) {
5459         int i, pred_mode;
5460         if( IS_INTRA4x4( mb_type ) ) {
5461             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5462                 mb_type |= MB_TYPE_8x8DCT;
5463                 for( i = 0; i < 16; i+=4 ) {
5464                     int pred = pred_intra_mode( h, i );
5465                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5466                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5467                 }
5468             } else {
5469                 for( i = 0; i < 16; i++ ) {
5470                     int pred = pred_intra_mode( h, i );
5471                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5472
5473                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5474                 }
5475             }
5476             write_back_intra_pred_mode(h);
5477             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5478         } else {
5479             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5480             if( h->intra16x16_pred_mode < 0 ) return -1;
5481         }
5482         if(CHROMA){
5483             h->chroma_pred_mode_table[mb_xy] =
5484             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5485
5486             pred_mode= check_intra_pred_mode( h, pred_mode );
5487             if( pred_mode < 0 ) return -1;
5488             h->chroma_pred_mode= pred_mode;
5489         }
5490     } else if( partition_count == 4 ) {
5491         int i, j, sub_partition_count[4], list, ref[2][4];
5492
5493         if( h->slice_type_nos == FF_B_TYPE ) {
5494             for( i = 0; i < 4; i++ ) {
5495                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5496                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5497                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5498             }
5499             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5500                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5501                 pred_direct_motion(h, &mb_type);
5502                 h->ref_cache[0][scan8[4]] =
5503                 h->ref_cache[1][scan8[4]] =
5504                 h->ref_cache[0][scan8[12]] =
5505                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5506                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5507                     for( i = 0; i < 4; i++ )
5508                         if( IS_DIRECT(h->sub_mb_type[i]) )
5509                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5510                 }
5511             }
5512         } else {
5513             for( i = 0; i < 4; i++ ) {
5514                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5515                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5516                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5517             }
5518         }
5519
5520         for( list = 0; list < h->list_count; list++ ) {
5521                 for( i = 0; i < 4; i++ ) {
5522                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5523                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5524                         if( h->ref_count[list] > 1 ){
5525                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5526                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5527                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5528                                 return -1;
5529                             }
5530                         }else
5531                             ref[list][i] = 0;
5532                     } else {
5533                         ref[list][i] = -1;
5534                     }
5535                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5536                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5537                 }
5538         }
5539
5540         if(dct8x8_allowed)
5541             dct8x8_allowed = get_dct8x8_allowed(h);
5542
5543         for(list=0; list<h->list_count; list++){
5544             for(i=0; i<4; i++){
5545                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5546                 if(IS_DIRECT(h->sub_mb_type[i])){
5547                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5548                     continue;
5549                 }
5550
5551                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5552                     const int sub_mb_type= h->sub_mb_type[i];
5553                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5554                     for(j=0; j<sub_partition_count[i]; j++){
5555                         int mpx, mpy;
5556                         int mx, my;
5557                         const int index= 4*i + block_width*j;
5558                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5559                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5560                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5561
5562                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5563                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5564                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5565
5566                         if(IS_SUB_8X8(sub_mb_type)){
5567                             mv_cache[ 1 ][0]=
5568                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5569                             mv_cache[ 1 ][1]=
5570                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5571
5572                             mvd_cache[ 1 ][0]=
5573                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5574                             mvd_cache[ 1 ][1]=
5575                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5576                         }else if(IS_SUB_8X4(sub_mb_type)){
5577                             mv_cache[ 1 ][0]= mx;
5578                             mv_cache[ 1 ][1]= my;
5579
5580                             mvd_cache[ 1 ][0]= mx - mpx;
5581                             mvd_cache[ 1 ][1]= my - mpy;
5582                         }else if(IS_SUB_4X8(sub_mb_type)){
5583                             mv_cache[ 8 ][0]= mx;
5584                             mv_cache[ 8 ][1]= my;
5585
5586                             mvd_cache[ 8 ][0]= mx - mpx;
5587                             mvd_cache[ 8 ][1]= my - mpy;
5588                         }
5589                         mv_cache[ 0 ][0]= mx;
5590                         mv_cache[ 0 ][1]= my;
5591
5592                         mvd_cache[ 0 ][0]= mx - mpx;
5593                         mvd_cache[ 0 ][1]= my - mpy;
5594                     }
5595                 }else{
5596                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5597                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5598                     p[0] = p[1] = p[8] = p[9] = 0;
5599                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5600                 }
5601             }
5602         }
5603     } else if( IS_DIRECT(mb_type) ) {
5604         pred_direct_motion(h, &mb_type);
5605         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5606         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5607         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5608     } else {
5609         int list, mx, my, i, mpx, mpy;
5610         if(IS_16X16(mb_type)){
5611             for(list=0; list<h->list_count; list++){
5612                 if(IS_DIR(mb_type, 0, list)){
5613                     int ref;
5614                     if(h->ref_count[list] > 1){
5615                         ref= decode_cabac_mb_ref(h, list, 0);
5616                         if(ref >= (unsigned)h->ref_count[list]){
5617                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5618                             return -1;
5619                         }
5620                     }else
5621                         ref=0;
5622                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5623                 }else
5624                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5625             }
5626             for(list=0; list<h->list_count; list++){
5627                 if(IS_DIR(mb_type, 0, list)){
5628                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5629
5630                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5631                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5632                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5633
5634                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5635                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5636                 }else
5637                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5638             }
5639         }
5640         else if(IS_16X8(mb_type)){
5641             for(list=0; list<h->list_count; list++){
5642                     for(i=0; i<2; i++){
5643                         if(IS_DIR(mb_type, i, list)){
5644                             int ref;
5645                             if(h->ref_count[list] > 1){
5646                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5647                                 if(ref >= (unsigned)h->ref_count[list]){
5648                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5649                                     return -1;
5650                                 }
5651                             }else
5652                                 ref=0;
5653                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5654                         }else
5655                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5656                     }
5657             }
5658             for(list=0; list<h->list_count; list++){
5659                 for(i=0; i<2; i++){
5660                     if(IS_DIR(mb_type, i, list)){
5661                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5662                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5663                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5664                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5665
5666                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5667                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5668                     }else{
5669                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5670                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5671                     }
5672                 }
5673             }
5674         }else{
5675             assert(IS_8X16(mb_type));
5676             for(list=0; list<h->list_count; list++){
5677                     for(i=0; i<2; i++){
5678                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5679                             int ref;
5680                             if(h->ref_count[list] > 1){
5681                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5682                                 if(ref >= (unsigned)h->ref_count[list]){
5683                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5684                                     return -1;
5685                                 }
5686                             }else
5687                                 ref=0;
5688                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5689                         }else
5690                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5691                     }
5692             }
5693             for(list=0; list<h->list_count; list++){
5694                 for(i=0; i<2; i++){
5695                     if(IS_DIR(mb_type, i, list)){
5696                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5697                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5698                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5699
5700                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5701                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5702                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5703                     }else{
5704                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5705                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5706                     }
5707                 }
5708             }
5709         }
5710     }
5711
5712    if( IS_INTER( mb_type ) ) {
5713         h->chroma_pred_mode_table[mb_xy] = 0;
5714         write_back_motion( h, mb_type );
5715    }
5716
5717     if( !IS_INTRA16x16( mb_type ) ) {
5718         cbp  = decode_cabac_mb_cbp_luma( h );
5719         if(CHROMA)
5720             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5721     }
5722
5723     h->cbp_table[mb_xy] = h->cbp = cbp;
5724
5725     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5726         if( decode_cabac_mb_transform_size( h ) )
5727             mb_type |= MB_TYPE_8x8DCT;
5728     }
5729     s->current_picture.mb_type[mb_xy]= mb_type;
5730
5731     if( cbp || IS_INTRA16x16( mb_type ) ) {
5732         const uint8_t *scan, *scan8x8, *dc_scan;
5733         const uint32_t *qmul;
5734         int dqp;
5735
5736         if(IS_INTERLACED(mb_type)){
5737             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5738             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5739             dc_scan= luma_dc_field_scan;
5740         }else{
5741             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5742             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5743             dc_scan= luma_dc_zigzag_scan;
5744         }
5745
5746         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5747         if( dqp == INT_MIN ){
5748             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5749             return -1;
5750         }
5751         s->qscale += dqp;
5752         if(((unsigned)s->qscale) > 51){
5753             if(s->qscale<0) s->qscale+= 52;
5754             else            s->qscale-= 52;
5755         }
5756         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5757         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5758
5759         if( IS_INTRA16x16( mb_type ) ) {
5760             int i;
5761             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5762             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5763
5764             if( cbp&15 ) {
5765                 qmul = h->dequant4_coeff[0][s->qscale];
5766                 for( i = 0; i < 16; i++ ) {
5767                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5768                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5769                 }
5770             } else {
5771                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5772             }
5773         } else {
5774             int i8x8, i4x4;
5775             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5776                 if( cbp & (1<<i8x8) ) {
5777                     if( IS_8x8DCT(mb_type) ) {
5778                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5779                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5780                     } else {
5781                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5782                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5783                             const int index = 4*i8x8 + i4x4;
5784                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5785 //START_TIMER
5786                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5787 //STOP_TIMER("decode_residual")
5788                         }
5789                     }
5790                 } else {
5791                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5792                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5793                 }
5794             }
5795         }
5796
5797         if( cbp&0x30 ){
5798             int c;
5799             for( c = 0; c < 2; c++ ) {
5800                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5801                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5802             }
5803         }
5804
5805         if( cbp&0x20 ) {
5806             int c, i;
5807             for( c = 0; c < 2; c++ ) {
5808                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5809                 for( i = 0; i < 4; i++ ) {
5810                     const int index = 16 + 4 * c + i;
5811                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5812                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5813                 }
5814             }
5815         } else {
5816             uint8_t * const nnz= &h->non_zero_count_cache[0];
5817             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5818             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5819         }
5820     } else {
5821         uint8_t * const nnz= &h->non_zero_count_cache[0];
5822         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5823         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5824         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5825         h->last_qscale_diff = 0;
5826     }
5827
5828     s->current_picture.qscale_table[mb_xy]= s->qscale;
5829     write_back_non_zero_count(h);
5830
5831     if(MB_MBAFF){
5832         h->ref_count[0] >>= 1;
5833         h->ref_count[1] >>= 1;
5834     }
5835
5836     return 0;
5837 }
5838
5839
/**
 * Deblock one luma edge using the DSP h264_h_loop_filter_luma routines.
 *
 * @param h      decoder context; supplies the slice alpha/beta offsets and
 *               the DSP function table
 * @param pix    sample pointer, passed unchanged to the DSP filter
 * @param stride line stride, passed unchanged to the DSP filter
 * @param bS     four boundary-strength values for the edge; only bS[0]
 *               selects between the normal (< 4) and the intra filter
 * @param qp     quantizer used to index the alpha, beta and tc0 tables
 */
5840 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5842     const int index_a = qp + h->slice_alpha_c0_offset;
5843     const int alpha = (alpha_table+52)[index_a];
5844     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5845
5846     if( bS[0] < 4 ) {
              /* normal filter: one clipping threshold per bS entry */
5847         int8_t tc[4];
5848         tc[0] = (tc0_table+52)[index_a][bS[0]];
5849         tc[1] = (tc0_table+52)[index_a][bS[1]];
5850         tc[2] = (tc0_table+52)[index_a][bS[2]];
5851         tc[3] = (tc0_table+52)[index_a][bS[3]];
5852         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5853     } else {
              /* bS[0] >= 4: the whole edge goes through the intra filter */
5854         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5855     }
5856 }
/**
 * Deblock one chroma edge using the DSP h264_h_loop_filter_chroma routines.
 *
 * @param h      decoder context; supplies the slice alpha/beta offsets and
 *               the DSP function table
 * @param pix    sample pointer, passed unchanged to the DSP filter
 * @param stride line stride, passed unchanged to the DSP filter
 * @param bS     four boundary-strength values for the edge; only bS[0]
 *               selects between the normal (< 4) and the intra filter
 * @param qp     quantizer used to index the alpha, beta and tc0 tables
 */
5857 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5859     const int index_a = qp + h->slice_alpha_c0_offset;
5860     const int alpha = (alpha_table+52)[index_a];
5861     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5862
5863     if( bS[0] < 4 ) {
              /* chroma thresholds are the tc0 table value plus one */
5864         int8_t tc[4];
5865         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5866         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5867         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5868         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5869         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5870     } else {
              /* bS[0] >= 4: the whole edge goes through the intra filter */
5871         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5872     }
5873 }
5874
/**
 * Filter 16 rows across one luma edge in C, one row at a time, choosing the
 * boundary strength and quantizer separately for every row.
 *
 * @param h      decoder context; supplies the slice alpha/beta offsets
 * @param pix    pointer to the first sample right of the edge (q0) in row 0;
 *               samples pix[-4] .. pix[3] of each row may be accessed
 * @param stride distance between consecutive rows
 * @param bS     eight boundary-strength values; a row whose value is 0 is skipped
 * @param qp     two quantizers; which one applies to a row depends on MB_FIELD
 */
5875 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5876     int i;
5877     for( i = 0; i < 16; i++, pix += stride) {
5878         int index_a;
5879         int alpha;
5880         int beta;
5881
5882         int qp_index;
              /* field MB: rows 2k and 2k+1 share bS[k];
               * otherwise the low bit is replaced by the row parity, so
               * even rows use an even bS entry and odd rows the following odd one */
5883         int bS_index = (i >> 1);
5884         if (!MB_FIELD) {
5885             bS_index &= ~1;
5886             bS_index |= (i & 1);
5887         }
5888
5889         if( bS[bS_index] == 0 ) {
5890             continue;
5891         }
5892
              /* field MB: rows 0-7 use qp[0], rows 8-15 use qp[1];
               * otherwise the quantizer alternates with the row parity */
5893         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5894         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5895         alpha = (alpha_table+52)[index_a];
5896         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5897
5898         if( bS[bS_index] < 4 ) {
                  /* normal filter: corrections are clipped with tc0 / tc;
                   * all samples are read before any of them is written */
5899             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5900             const int p0 = pix[-1];
5901             const int p1 = pix[-2];
5902             const int p2 = pix[-3];
5903             const int q0 = pix[0];
5904             const int q1 = pix[1];
5905             const int q2 = pix[2];
5906
5907             if( FFABS( p0 - q0 ) < alpha &&
5908                 FFABS( p1 - p0 ) < beta &&
5909                 FFABS( q1 - q0 ) < beta ) {
5910                 int tc = tc0;
5911                 int i_delta;
5912
                      /* each side whose third sample is within beta of its
                       * first gets its second sample adjusted and raises tc by one */
5913                 if( FFABS( p2 - p0 ) < beta ) {
5914                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5915                     tc++;
5916                 }
5917                 if( FFABS( q2 - q0 ) < beta ) {
5918                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5919                     tc++;
5920                 }
5921
5922                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5923                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5924                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5925                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5926             }
5927         }else{
                  /* bS >= 4: strong filter, no tc clipping */
5928             const int p0 = pix[-1];
5929             const int p1 = pix[-2];
5930             const int p2 = pix[-3];
5931
5932             const int q0 = pix[0];
5933             const int q1 = pix[1];
5934             const int q2 = pix[2];
5935
5936             if( FFABS( p0 - q0 ) < alpha &&
5937                 FFABS( p1 - p0 ) < beta &&
5938                 FFABS( q1 - q0 ) < beta ) {
5939
                      /* a small step across the edge allows rewriting up to
                       * three samples per side; otherwise only p0 and q0 change */
5940                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5941                     if( FFABS( p2 - p0 ) < beta)
5942                     {
5943                         const int p3 = pix[-4];
5944                         /* p0', p1', p2' */
5945                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5946                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5947                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5948                     } else {
5949                         /* p0' */
5950                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5951                     }
5952                     if( FFABS( q2 - q0 ) < beta)
5953                     {
5954                         const int q3 = pix[3];
5955                         /* q0', q1', q2' */
5956                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5957                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5958                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5959                     } else {
5960                         /* q0' */
5961                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5962                     }
5963                 }else{
5964                     /* p0', q0' */
5965                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5966                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5967                 }
5968                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5969             }
5970         }
5971     }
5972 }
/**
 * Filter 8 rows across one chroma edge in C, one row at a time, choosing the
 * boundary strength and quantizer separately for every row.
 *
 * @param h      decoder context; supplies the slice alpha/beta offsets
 * @param pix    pointer to the first sample right of the edge (q0) in row 0;
 *               only pix[-1] and pix[0] are modified, the trace output also
 *               reads pix[-3] .. pix[2]
 * @param stride distance between consecutive rows
 * @param bS     eight boundary-strength values, one per row; 0 skips the row
 * @param qp     two quantizers; which one applies to a row depends on MB_FIELD
 */
5973 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5974     int i;
5975     for( i = 0; i < 8; i++, pix += stride) {
5976         int index_a;
5977         int alpha;
5978         int beta;
5979
5980         int qp_index;
5981         int bS_index = i;
5982
5983         if( bS[bS_index] == 0 ) {
5984             continue;
5985         }
5986
              /* field MB: rows 0-3 use qp[0], rows 4-7 use qp[1];
               * otherwise the quantizer alternates with the row parity */
5987         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5988         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5989         alpha = (alpha_table+52)[index_a];
5990         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5991
5992         if( bS[bS_index] < 4 ) {
                  /* normal filter: the clipping threshold is the tc0 table value plus one */
5993             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
5994             const int p0 = pix[-1];
5995             const int p1 = pix[-2];
5996             const int q0 = pix[0];
5997             const int q1 = pix[1];
5998
5999             if( FFABS( p0 - q0 ) < alpha &&
6000                 FFABS( p1 - p0 ) < beta &&
6001                 FFABS( q1 - q0 ) < beta ) {
6002                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6003
6004                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6005                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6006                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6007             }
6008         }else{
                  /* bS >= 4: strong filter, p0 and q0 are replaced by weighted averages */
6009             const int p0 = pix[-1];
6010             const int p1 = pix[-2];
6011             const int q0 = pix[0];
6012             const int q1 = pix[1];
6013
6014             if( FFABS( p0 - q0 ) < alpha &&
6015                 FFABS( p1 - p0 ) < beta &&
6016                 FFABS( q1 - q0 ) < beta ) {
6017
6018                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6019                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6020                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6021             }
6022         }
6023     }
6024 }
6025
/**
 * Deblock one luma edge using the DSP h264_v_loop_filter_luma routines.
 *
 * @param h      decoder context; supplies the slice alpha/beta offsets and
 *               the DSP function table
 * @param pix    sample pointer, passed unchanged to the DSP filter
 * @param stride line stride, passed unchanged to the DSP filter
 * @param bS     four boundary-strength values for the edge; only bS[0]
 *               selects between the normal (< 4) and the intra filter
 * @param qp     quantizer used to index the alpha, beta and tc0 tables
 */
6026 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6028     const int index_a = qp + h->slice_alpha_c0_offset;
6029     const int alpha = (alpha_table+52)[index_a];
6030     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6032
6033     if( bS[0] < 4 ) {
              /* normal filter: one clipping threshold per bS entry */
6034         int8_t tc[4];
6035         tc[0] = (tc0_table+52)[index_a][bS[0]];
6036         tc[1] = (tc0_table+52)[index_a][bS[1]];
6037         tc[2] = (tc0_table+52)[index_a][bS[2]];
6038         tc[3] = (tc0_table+52)[index_a][bS[3]];
6039         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6040     } else {
              /* bS[0] >= 4: the whole edge goes through the intra filter */
6041         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6042     }
6043 }
6044
/**
 * Deblock one chroma edge using the DSP h264_v_loop_filter_chroma routines.
 *
 * @param h      decoder context; supplies the slice alpha/beta offsets and
 *               the DSP function table
 * @param pix    sample pointer, passed unchanged to the DSP filter
 * @param stride line stride, passed unchanged to the DSP filter
 * @param bS     four boundary-strength values for the edge; only bS[0]
 *               selects between the normal (< 4) and the intra filter
 * @param qp     quantizer used to index the alpha, beta and tc0 tables
 */
6045 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6047     const int index_a = qp + h->slice_alpha_c0_offset;
6048     const int alpha = (alpha_table+52)[index_a];
6049     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6050
6051     if( bS[0] < 4 ) {
              /* chroma thresholds are the tc0 table value plus one */
6052         int8_t tc[4];
6053         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6054         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6055         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6056         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6057         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6058     } else {
              /* bS[0] >= 4: the whole edge goes through the intra filter */
6059         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6060     }
6061 }
6062
/**
 * Deblock one macroblock through a shortcut path, delegating to filter_mb()
 * whenever one of the preconditions checked at the top is not met.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of the macroblock
 * @param img_cb     Cb samples of the macroblock
 * @param img_cr     Cr samples of the macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6063 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6064     MpegEncContext * const s = &h->s;
6065     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6066     int mb_xy, mb_type;
6067     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6068
6069     mb_xy = h->mb_xy;
6070
          /* Use the generic filter_mb() for the first column, the first row,
           * when no DSP bS routine is available, when pps.chroma_qp_diff is set,
           * when CODEC_FLAG2_FAST is not requested, or when deblocking_filter == 2
           * and the left or top neighbour belongs to a different slice. */
6071     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6072         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus has to be, but should not under CODEC_FLAG2_FAST
6073        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6074                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6075         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6076         return;
6077     }
6078     assert(!FRAME_MBAFF);
6079
          /* qp/qpc: this macroblock; qp0/qpc0: rounded average with the left
           * neighbour; qp1/qpc1: rounded average with the top neighbour */
6080     mb_type = s->current_picture.mb_type[mb_xy];
6081     qp = s->current_picture.qscale_table[mb_xy];
6082     qp0 = s->current_picture.qscale_table[mb_xy-1];
6083     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6084     qpc = get_chroma_qp( h, 0, qp );
6085     qpc0 = get_chroma_qp( h, 0, qp0 );
6086     qpc1 = get_chroma_qp( h, 0, qp1 );
6087     qp0 = (qp + qp0 + 1) >> 1;
6088     qp1 = (qp + qp1 + 1) >> 1;
6089     qpc0 = (qpc + qpc0 + 1) >> 1;
6090     qpc1 = (qpc + qpc1 + 1) >> 1;
          /* nothing is filtered when every quantizer involved is at or below the threshold */
6091     qp_thresh = 15 - h->slice_alpha_c0_offset;
6092     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6093        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6094         return;
6095
6096     if( IS_INTRA(mb_type) ) {
              /* intra MB: fixed strengths, 4 on the left MB edge, 3 on inner
               * edges; the top MB edge uses 3 in field pictures and 4 otherwise */
6097         int16_t bS4[4] = {4,4,4,4};
6098         int16_t bS3[4] = {3,3,3,3};
6099         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6100         if( IS_8x8DCT(mb_type) ) {
                  /* 8x8 transform: only luma edges 0 and 2 are filtered */
6101             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6102             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6103             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6104             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6105         } else {
6106             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6107             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6108             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6109             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6110             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6111             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6112             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6113             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6114         }
6115         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6116         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6117         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6118         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6119         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6120         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6121         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6122         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6123         return;
6124     } else {
              /* bS[dir][edge][4]; bSv views the four int16 values of one edge
               * as a single 64-bit word so an edge can be set or tested at once */
6125         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6126         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6127         int edges;
6128         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                  /* edges 0 and 2 in both directions get strength 2 without
                   * calling the DSP routine */
6129             edges = 4;
6130             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6131         } else {
6132             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6133                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6134             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6135                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6136                              ? 3 : 0;
6137             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6138             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6139             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6140                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6141         }
              /* an intra neighbour overrides the strength of the shared MB edge */
6142         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6143             bSv[0][0] = 0x0004000400040004ULL;
6144         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6145             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6146
              /* FILTER(hv,dir,edge): if any strength of the edge is non-zero,
               * filter luma and, for even edges, both chroma planes.
               * Edge 0 uses the neighbour-averaged quantizers (qp0/qp1, qpc0/qpc1),
               * inner edges use qp/qpc. */
6147 #define FILTER(hv,dir,edge)\
6148         if(bSv[dir][edge]) {\
6149             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6150             if(!(edge&1)) {\
6151                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6152                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6153             }\
6154         }
6155         if( edges == 1 ) {
6156             FILTER(v,0,0);
6157             FILTER(h,1,0);
6158         } else if( IS_8x8DCT(mb_type) ) {
6159             FILTER(v,0,0);
6160             FILTER(v,0,2);
6161             FILTER(h,1,0);
6162             FILTER(h,1,2);
6163         } else {
6164             FILTER(v,0,0);
6165             FILTER(v,0,1);
6166             FILTER(v,0,2);
6167             FILTER(v,0,3);
6168             FILTER(h,1,0);
6169             FILTER(h,1,1);
6170             FILTER(h,1,2);
6171             FILTER(h,1,3);
6172         }
6173 #undef FILTER
6174     }
6175 }
6176
6177
/**
 * Compute the boundary strengths of all edges of one macroblock in one
 * direction and filter them.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column (only used for trace output)
 * @param mb_y       macroblock row
 * @param img_y      luma samples of the macroblock
 * @param img_cb     Cb samples of the macroblock
 * @param img_cr     Cr samples of the macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 * @param mb_xy      index of the macroblock in the per-MB tables
 * @param mb_type    type flags of the macroblock
 * @param mvy_limit  threshold for the vertical motion vector difference above
 *                   which an edge gets a non-zero strength
 * @param first_vertical_edge_done  when non-zero, edge 0 is skipped
 * @param dir        0: edges filtered with filter_mb_edgev/edgecv, neighbour
 *                   is mb_xy-1; 1: edges filtered with filter_mb_edgeh/edgech,
 *                   neighbour is h->top_mb_xy
 */
6178 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6179     MpegEncContext * const s = &h->s;
6180     int edge;
6181     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6182     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6183     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6184     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
          /* edge 0 is skipped when the neighbour's slice_table entry is 0xFFFF */
6185     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6186
6187     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6188                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6189     // how often to recheck mv-based bS when iterating between edges
6190     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6191                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6192     // how often to recheck mv-based bS when iterating along each edge
6193     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6194
6195     if (first_vertical_edge_done) {
6196         start = 1;
6197     }
6198
          /* deblocking_filter == 2: do not filter across a slice boundary */
6199     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6200         start = 1;
6201
6202     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6203         && !IS_INTERLACED(mb_type)
6204         && IS_INTERLACED(mbm_type)
6205         ) {
6206         // This is a special case in the norm where the filtering must
6207         // be done twice (one each of the field) even if we are in a
6208         // frame macroblock.
6209         //
6210         static const int nnz_idx[4] = {4,5,6,3};
6211         unsigned int tmp_linesize   = 2 *   linesize;
6212         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6213         int mbn_xy = mb_xy - 2 * s->mb_stride;
6214         int qp;
6215         int i, j;
6216         int16_t bS[4];
6217
              /* j selects the starting line (0 or 1); with the doubled stride
               * each pass touches every second line. mbn_xy walks over the two
               * macroblocks above. */
6218         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6219             if( IS_INTRA(mb_type) ||
6220                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6221                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6222             } else {
6223                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6224                 for( i = 0; i < 4; i++ ) {
6225                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6226                         mbn_nnz[nnz_idx[i]] != 0 )
6227                         bS[i] = 2;
6228                     else
6229                         bS[i] = 1;
6230                 }
6231             }
6232             // Do not use s->qscale as luma quantizer because it has not the same
6233             // value in IPCM macroblocks.
6234             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
                  /* 'edge' has not been assigned yet at this point; this block
                   * only runs for start == 0, so report edge 0 explicitly */
6235             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, 0, qp, tmp_linesize, tmp_uvlinesize);
6236             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6237             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6238             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6239                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6240             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6241                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6242         }
6243
              /* edge 0 has been handled above */
6244         start = 1;
6245     }
6246
6247     /* Calculate bS */
6248     for( edge = start; edge < edges; edge++ ) {
6249         /* mbn_xy: neighbor macroblock */
6250         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6251         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6252         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6253         int16_t bS[4];
6254         int qp;
6255
              /* with the 8x8 transform the odd edges are not filtered */
6256         if( (edge&1) && IS_8x8DCT(mb_type) )
6257             continue;
6258
6259         if( IS_INTRA(mb_type) ||
6260             IS_INTRA(mbn_type) ) {
6261             int value;
6262             if (edge == 0) {
6263                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6264                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6265                 ) {
6266                     value = 4;
6267                 } else {
6268                     value = 3;
6269                 }
6270             } else {
6271                 value = 3;
6272             }
6273             bS[0] = bS[1] = bS[2] = bS[3] = value;
6274         } else {
6275             int i, l;
                  /* mv_done: the ref/mv comparison has been settled for the whole
                   * edge, so the per-position loop below only checks coefficients */
6276             int mv_done;
6277
6278             if( edge & mask_edge ) {
6279                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6280                 mv_done = 1;
6281             }
6282             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6283                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6284                 mv_done = 1;
6285             }
6286             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6287                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6288                 int bn_idx= b_idx - (dir ? 8:1);
6289                 int v = 0;
6290
6291                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6292                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6293                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6294                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6295                 }
6296
                      /* B slices: on a mismatch retry with the two lists of the
                       * neighbour swapped */
6297                 if(h->slice_type_nos == FF_B_TYPE && v){
6298                     v=0;
6299                     for( l = 0; !v && l < 2; l++ ) {
6300                         int ln= 1-l;
6301                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6302                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6303                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6304                     }
6305                 }
6306
6307                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6308                 mv_done = 1;
6309             }
6310             else
6311                 mv_done = 0;
6312
6313             for( i = 0; i < 4; i++ ) {
6314                 int x = dir == 0 ? edge : i;
6315                 int y = dir == 0 ? i    : edge;
6316                 int b_idx= 8 + 4 + x + 8*y;
6317                 int bn_idx= b_idx - (dir ? 8:1);
6318
                      /* non-zero coefficients on either side give strength 2 */
6319                 if( h->non_zero_count_cache[b_idx] |
6320                     h->non_zero_count_cache[bn_idx] ) {
6321                     bS[i] = 2;
6322                 }
6323                 else if(!mv_done)
6324                 {
6325                     bS[i] = 0;
6326                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6327                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6328                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6329                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6330                             bS[i] = 1;
6331                             break;
6332                         }
6333                     }
6334
6335                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6336                         bS[i] = 0;
6337                         for( l = 0; l < 2; l++ ) {
6338                             int ln= 1-l;
6339                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6340                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6341                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6342                                 bS[i] = 1;
6343                                 break;
6344                             }
6345                         }
6346                     }
6347                 }
6348             }
6349
                  /* all strengths zero: nothing to filter on this edge */
6350             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6351                 continue;
6352         }
6353
6354         /* Filter edge */
6355         // Do not use s->qscale as luma quantizer because it has not the same
6356         // value in IPCM macroblocks.
6357         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6358         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6359         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6360         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6361         if( dir == 0 ) {
6362             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
                  /* chroma is filtered on even edges only */
6363             if( (edge&1) == 0 ) {
6364                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6365                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6366                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6367                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6368             }
6369         } else {
6370             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6371             if( (edge&1) == 0 ) {
6372                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6373                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6374                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6375                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6376             }
6377         }
6378     }
6379 }
6380
6381 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6382     MpegEncContext * const s = &h->s;
6383     const int mb_xy= mb_x + mb_y*s->mb_stride;
6384     const int mb_type = s->current_picture.mb_type[mb_xy];
6385     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6386     int first_vertical_edge_done = 0;
6387     int dir;
6388
6389     //for sufficiently low qp, filtering wouldn't do anything
6390     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6391     if(!FRAME_MBAFF){
6392         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6393         int qp = s->current_picture.qscale_table[mb_xy];
6394         if(qp <= qp_thresh
6395            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6396            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6397             return;
6398         }
6399     }
6400
6401     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6402     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6403         int top_type, left_type[2];
6404         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6405         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6406         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6407
6408         if(IS_8x8DCT(top_type)){
6409             h->non_zero_count_cache[4+8*0]=
6410             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6411             h->non_zero_count_cache[6+8*0]=
6412             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6413         }
6414         if(IS_8x8DCT(left_type[0])){
6415             h->non_zero_count_cache[3+8*1]=
6416             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6417         }
6418         if(IS_8x8DCT(left_type[1])){
6419             h->non_zero_count_cache[3+8*3]=
6420             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6421         }
6422
6423         if(IS_8x8DCT(mb_type)){
6424             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6425             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6426
6427             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6428             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6429
6430             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6431             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6432
6433             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6434             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6435         }
6436     }
6437
6438     if (FRAME_MBAFF
6439             // left mb is in picture
6440             && h->slice_table[mb_xy-1] != 0xFFFF
6441             // and current and left pair do not have the same interlaced type
6442             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6443             // and left mb is in the same slice if deblocking_filter == 2
6444             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6445         /* First vertical edge is different in MBAFF frames
6446          * There are 8 different bS to compute and 2 different Qp
6447          */
6448         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6449         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6450         int16_t bS[8];
6451         int qp[2];
6452         int bqp[2];
6453         int rqp[2];
6454         int mb_qp, mbn0_qp, mbn1_qp;
6455         int i;
6456         first_vertical_edge_done = 1;
6457
6458         if( IS_INTRA(mb_type) )
6459             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6460         else {
6461             for( i = 0; i < 8; i++ ) {
6462                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6463
6464                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6465                     bS[i] = 4;
6466                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6467                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6468                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6469                                                                        :
6470                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6471                     bS[i] = 2;
6472                 else
6473                     bS[i] = 1;
6474             }
6475         }
6476
6477         mb_qp = s->current_picture.qscale_table[mb_xy];
6478         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6479         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6480         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6481         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6482                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6483         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6484                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6485         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6486         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6487                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6488         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6489                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6490
6491         /* Filter edge */
6492         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6493         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6494         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6495         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6496         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6497     }
6498
6499 #ifdef CONFIG_SMALL
6500     for( dir = 0; dir < 2; dir++ )
6501         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6502 #else
6503     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6504     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6505 #endif
6506 }
6507
6508 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6509     H264Context *h = *(void**)arg;
6510     MpegEncContext * const s = &h->s;
6511     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6512
6513     s->mb_skip_run= -1;
6514
6515     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6516                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6517
6518     if( h->pps.cabac ) {
6519         int i;
6520
6521         /* realign */
6522         align_get_bits( &s->gb );
6523
6524         /* init cabac */
6525         ff_init_cabac_states( &h->cabac);
6526         ff_init_cabac_decoder( &h->cabac,
6527                                s->gb.buffer + get_bits_count(&s->gb)/8,
6528                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6529         /* calculate pre-state */
6530         for( i= 0; i < 460; i++ ) {
6531             int pre;
6532             if( h->slice_type_nos == FF_I_TYPE )
6533                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6534             else
6535                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6536
6537             if( pre <= 63 )
6538                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6539             else
6540                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6541         }
6542
6543         for(;;){
6544 //START_TIMER
6545             int ret = decode_mb_cabac(h);
6546             int eos;
6547 //STOP_TIMER("decode_mb_cabac")
6548
6549             if(ret>=0) hl_decode_mb(h);
6550
6551             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6552                 s->mb_y++;
6553
6554                 if(ret>=0) ret = decode_mb_cabac(h);
6555
6556                 if(ret>=0) hl_decode_mb(h);
6557                 s->mb_y--;
6558             }
6559             eos = get_cabac_terminate( &h->cabac );
6560
6561             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6562                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6563                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6564                 return -1;
6565             }
6566
6567             if( ++s->mb_x >= s->mb_width ) {
6568                 s->mb_x = 0;
6569                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6570                 ++s->mb_y;
6571                 if(FIELD_OR_MBAFF_PICTURE) {
6572                     ++s->mb_y;
6573                 }
6574             }
6575
6576             if( eos || s->mb_y >= s->mb_height ) {
6577                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6578                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6579                 return 0;
6580             }
6581         }
6582
6583     } else {
6584         for(;;){
6585             int ret = decode_mb_cavlc(h);
6586
6587             if(ret>=0) hl_decode_mb(h);
6588
6589             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6590                 s->mb_y++;
6591                 ret = decode_mb_cavlc(h);
6592
6593                 if(ret>=0) hl_decode_mb(h);
6594                 s->mb_y--;
6595             }
6596
6597             if(ret<0){
6598                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6599                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6600
6601                 return -1;
6602             }
6603
6604             if(++s->mb_x >= s->mb_width){
6605                 s->mb_x=0;
6606                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6607                 ++s->mb_y;
6608                 if(FIELD_OR_MBAFF_PICTURE) {
6609                     ++s->mb_y;
6610                 }
6611                 if(s->mb_y >= s->mb_height){
6612                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6613
6614                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6615                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6616
6617                         return 0;
6618                     }else{
6619                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6620
6621                         return -1;
6622                     }
6623                 }
6624             }
6625
6626             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6627                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6628                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6629                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6630
6631                     return 0;
6632                 }else{
6633                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6634
6635                     return -1;
6636                 }
6637             }
6638         }
6639     }
6640
6641 #if 0
6642     for(;s->mb_y < s->mb_height; s->mb_y++){
6643         for(;s->mb_x < s->mb_width; s->mb_x++){
6644             int ret= decode_mb(h);
6645
6646             hl_decode_mb(h);
6647
6648             if(ret<0){
6649                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6650                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6651
6652                 return -1;
6653             }
6654
6655             if(++s->mb_x >= s->mb_width){
6656                 s->mb_x=0;
6657                 if(++s->mb_y >= s->mb_height){
6658                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6659                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6660
6661                         return 0;
6662                     }else{
6663                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6664
6665                         return -1;
6666                     }
6667                 }
6668             }
6669
6670             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6671                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6672                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6673
6674                     return 0;
6675                 }else{
6676                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6677
6678                     return -1;
6679                 }
6680             }
6681         }
6682         s->mb_x=0;
6683         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6684     }
6685 #endif
6686     return -1; //not reached
6687 }
6688
6689 static int decode_picture_timing(H264Context *h){
6690     MpegEncContext * const s = &h->s;
6691     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6692         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6693         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6694     }
6695     if(h->sps.pic_struct_present_flag){
6696         unsigned int i, num_clock_ts;
6697         h->sei_pic_struct = get_bits(&s->gb, 4);
6698
6699         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6700             return -1;
6701
6702         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6703
6704         for (i = 0 ; i < num_clock_ts ; i++){
6705             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6706                 unsigned int full_timestamp_flag;
6707                 skip_bits(&s->gb, 2);                 /* ct_type */
6708                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6709                 skip_bits(&s->gb, 5);                 /* counting_type */
6710                 full_timestamp_flag = get_bits(&s->gb, 1);
6711                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6712                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6713                 skip_bits(&s->gb, 8);                 /* n_frames */
6714                 if(full_timestamp_flag){
6715                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6716                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6717                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6718                 }else{
6719                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6720                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6721                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6722                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6723                             if(get_bits(&s->gb, 1))   /* hours_flag */
6724                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6725                         }
6726                     }
6727                 }
6728                 if(h->sps.time_offset_length > 0)
6729                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6730             }
6731         }
6732     }
6733     return 0;
6734 }
6735
6736 static int decode_unregistered_user_data(H264Context *h, int size){
6737     MpegEncContext * const s = &h->s;
6738     uint8_t user_data[16+256];
6739     int e, build, i;
6740
6741     if(size<16)
6742         return -1;
6743
6744     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6745         user_data[i]= get_bits(&s->gb, 8);
6746     }
6747
6748     user_data[i]= 0;
6749     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6750     if(e==1 && build>=0)
6751         h->x264_build= build;
6752
6753     if(s->avctx->debug & FF_DEBUG_BUGS)
6754         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6755
6756     for(; i<size; i++)
6757         skip_bits(&s->gb, 8);
6758
6759     return 0;
6760 }
6761
6762 static int decode_sei(H264Context *h){
6763     MpegEncContext * const s = &h->s;
6764
6765     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6766         int size, type;
6767
6768         type=0;
6769         do{
6770             type+= show_bits(&s->gb, 8);
6771         }while(get_bits(&s->gb, 8) == 255);
6772
6773         size=0;
6774         do{
6775             size+= show_bits(&s->gb, 8);
6776         }while(get_bits(&s->gb, 8) == 255);
6777
6778         switch(type){
6779         case 1: // Picture timing SEI
6780             if(decode_picture_timing(h) < 0)
6781                 return -1;
6782             break;
6783         case 5:
6784             if(decode_unregistered_user_data(h, size) < 0)
6785                 return -1;
6786             break;
6787         default:
6788             skip_bits(&s->gb, 8*size);
6789         }
6790
6791         //FIXME check bits here
6792         align_get_bits(&s->gb);
6793     }
6794
6795     return 0;
6796 }
6797
6798 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6799     MpegEncContext * const s = &h->s;
6800     int cpb_count, i;
6801     cpb_count = get_ue_golomb(&s->gb) + 1;
6802
6803     if(cpb_count > 32U){
6804         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6805         return -1;
6806     }
6807
6808     get_bits(&s->gb, 4); /* bit_rate_scale */
6809     get_bits(&s->gb, 4); /* cpb_size_scale */
6810     for(i=0; i<cpb_count; i++){
6811         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6812         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6813         get_bits1(&s->gb);     /* cbr_flag */
6814     }
6815     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6816     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6817     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6818     sps->time_offset_length = get_bits(&s->gb, 5);
6819     return 0;
6820 }
6821
6822 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6823     MpegEncContext * const s = &h->s;
6824     int aspect_ratio_info_present_flag;
6825     unsigned int aspect_ratio_idc;
6826
6827     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6828
6829     if( aspect_ratio_info_present_flag ) {
6830         aspect_ratio_idc= get_bits(&s->gb, 8);
6831         if( aspect_ratio_idc == EXTENDED_SAR ) {
6832             sps->sar.num= get_bits(&s->gb, 16);
6833             sps->sar.den= get_bits(&s->gb, 16);
6834         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6835             sps->sar=  pixel_aspect[aspect_ratio_idc];
6836         }else{
6837             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6838             return -1;
6839         }
6840     }else{
6841         sps->sar.num=
6842         sps->sar.den= 0;
6843     }
6844 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6845
6846     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6847         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6848     }
6849
6850     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6851         get_bits(&s->gb, 3);    /* video_format */
6852         get_bits1(&s->gb);      /* video_full_range_flag */
6853         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6854             get_bits(&s->gb, 8); /* colour_primaries */
6855             get_bits(&s->gb, 8); /* transfer_characteristics */
6856             get_bits(&s->gb, 8); /* matrix_coefficients */
6857         }
6858     }
6859
6860     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6861         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6862         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6863     }
6864
6865     sps->timing_info_present_flag = get_bits1(&s->gb);
6866     if(sps->timing_info_present_flag){
6867         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6868         sps->time_scale = get_bits_long(&s->gb, 32);
6869         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6870     }
6871
6872     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6873     if(sps->nal_hrd_parameters_present_flag)
6874         if(decode_hrd_parameters(h, sps) < 0)
6875             return -1;
6876     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6877     if(sps->vcl_hrd_parameters_present_flag)
6878         if(decode_hrd_parameters(h, sps) < 0)
6879             return -1;
6880     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6881         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6882     sps->pic_struct_present_flag = get_bits1(&s->gb);
6883
6884     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6885     if(sps->bitstream_restriction_flag){
6886         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6887         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6888         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6889         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6890         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6891         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6892         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6893
6894         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6895             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6896             return -1;
6897         }
6898     }
6899
6900     return 0;
6901 }
6902
6903 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6904                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6905     MpegEncContext * const s = &h->s;
6906     int i, last = 8, next = 8;
6907     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6908     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6909         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6910     else
6911     for(i=0;i<size;i++){
6912         if(next)
6913             next = (last + get_se_golomb(&s->gb)) & 0xff;
6914         if(!i && !next){ /* matrix not written, we use the preset one */
6915             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6916             break;
6917         }
6918         last = factors[scan[i]] = next ? next : last;
6919     }
6920 }
6921
6922 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6923                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6924     MpegEncContext * const s = &h->s;
6925     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6926     const uint8_t *fallback[4] = {
6927         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6928         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6929         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6930         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6931     };
6932     if(get_bits1(&s->gb)){
6933         sps->scaling_matrix_present |= is_sps;
6934         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6935         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6936         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6937         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6938         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6939         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6940         if(is_sps || pps->transform_8x8_mode){
6941             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6942             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6943         }
6944     }
6945 }
6946
6947 static inline int decode_seq_parameter_set(H264Context *h){
6948     MpegEncContext * const s = &h->s;
6949     int profile_idc, level_idc;
6950     unsigned int sps_id;
6951     int i;
6952     SPS *sps;
6953
6954     profile_idc= get_bits(&s->gb, 8);
6955     get_bits1(&s->gb);   //constraint_set0_flag
6956     get_bits1(&s->gb);   //constraint_set1_flag
6957     get_bits1(&s->gb);   //constraint_set2_flag
6958     get_bits1(&s->gb);   //constraint_set3_flag
6959     get_bits(&s->gb, 4); // reserved
6960     level_idc= get_bits(&s->gb, 8);
6961     sps_id= get_ue_golomb(&s->gb);
6962
6963     if(sps_id >= MAX_SPS_COUNT) {
6964         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
6965         return -1;
6966     }
6967     sps= av_mallocz(sizeof(SPS));
6968     if(sps == NULL)
6969         return -1;
6970
6971     sps->profile_idc= profile_idc;
6972     sps->level_idc= level_idc;
6973
6974     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
6975     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
6976     sps->scaling_matrix_present = 0;
6977
6978     if(sps->profile_idc >= 100){ //high profile
6979         sps->chroma_format_idc= get_ue_golomb(&s->gb);
6980         if(sps->chroma_format_idc == 3)
6981             get_bits1(&s->gb);  //residual_color_transform_flag
6982         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6983         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6984         sps->transform_bypass = get_bits1(&s->gb);
6985         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
6986     }else{
6987         sps->chroma_format_idc= 1;
6988     }
6989
6990     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6991     sps->poc_type= get_ue_golomb(&s->gb);
6992
6993     if(sps->poc_type == 0){ //FIXME #define
6994         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6995     } else if(sps->poc_type == 1){//FIXME #define
6996         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6997         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6998         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6999         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7000
7001         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7002             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7003             goto fail;
7004         }
7005
7006         for(i=0; i<sps->poc_cycle_length; i++)
7007             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7008     }else if(sps->poc_type != 2){
7009         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7010         goto fail;
7011     }
7012
7013     sps->ref_frame_count= get_ue_golomb(&s->gb);
7014     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7015         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7016         goto fail;
7017     }
7018     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7019     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7020     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7021     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7022        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7023         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7024         goto fail;
7025     }
7026
7027     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7028     if(!sps->frame_mbs_only_flag)
7029         sps->mb_aff= get_bits1(&s->gb);
7030     else
7031         sps->mb_aff= 0;
7032
7033     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7034
7035 #ifndef ALLOW_INTERLACE
7036     if(sps->mb_aff)
7037         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7038 #endif
7039     sps->crop= get_bits1(&s->gb);
7040     if(sps->crop){
7041         sps->crop_left  = get_ue_golomb(&s->gb);
7042         sps->crop_right = get_ue_golomb(&s->gb);
7043         sps->crop_top   = get_ue_golomb(&s->gb);
7044         sps->crop_bottom= get_ue_golomb(&s->gb);
7045         if(sps->crop_left || sps->crop_top){
7046             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7047         }
7048         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7049             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7050         }
7051     }else{
7052         sps->crop_left  =
7053         sps->crop_right =
7054         sps->crop_top   =
7055         sps->crop_bottom= 0;
7056     }
7057
7058     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7059     if( sps->vui_parameters_present_flag )
7060         decode_vui_parameters(h, sps);
7061
7062     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7063         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7064                sps_id, sps->profile_idc, sps->level_idc,
7065                sps->poc_type,
7066                sps->ref_frame_count,
7067                sps->mb_width, sps->mb_height,
7068                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7069                sps->direct_8x8_inference_flag ? "8B8" : "",
7070                sps->crop_left, sps->crop_right,
7071                sps->crop_top, sps->crop_bottom,
7072                sps->vui_parameters_present_flag ? "VUI" : "",
7073                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7074                );
7075     }
7076     av_free(h->sps_buffers[sps_id]);
7077     h->sps_buffers[sps_id]= sps;
7078     return 0;
7079 fail:
7080     av_free(sps);
7081     return -1;
7082 }
7083
7084 static void
7085 build_qp_table(PPS *pps, int t, int index)
7086 {
7087     int i;
7088     for(i = 0; i < 52; i++)
7089         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7090 }
7091
7092 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7093     MpegEncContext * const s = &h->s;
7094     unsigned int pps_id= get_ue_golomb(&s->gb);
7095     PPS *pps;
7096
7097     if(pps_id >= MAX_PPS_COUNT) {
7098         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7099         return -1;
7100     }
7101
7102     pps= av_mallocz(sizeof(PPS));
7103     if(pps == NULL)
7104         return -1;
7105     pps->sps_id= get_ue_golomb(&s->gb);
7106     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7107         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7108         goto fail;
7109     }
7110
7111     pps->cabac= get_bits1(&s->gb);
7112     pps->pic_order_present= get_bits1(&s->gb);
7113     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7114     if(pps->slice_group_count > 1 ){
7115         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7116         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7117         switch(pps->mb_slice_group_map_type){
7118         case 0:
7119 #if 0
7120 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7121 |    run_length[ i ]                                |1  |ue(v)   |
7122 #endif
7123             break;
7124         case 2:
7125 #if 0
7126 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7127 |{                                                  |   |        |
7128 |    top_left_mb[ i ]                               |1  |ue(v)   |
7129 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7130 |   }                                               |   |        |
7131 #endif
7132             break;
7133         case 3:
7134         case 4:
7135         case 5:
7136 #if 0
7137 |   slice_group_change_direction_flag               |1  |u(1)    |
7138 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7139 #endif
7140             break;
7141         case 6:
7142 #if 0
7143 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7144 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7145 |)                                                  |   |        |
7146 |    slice_group_id[ i ]                            |1  |u(v)    |
7147 #endif
7148             break;
7149         }
7150     }
7151     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7152     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7153     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7154         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7155         goto fail;
7156     }
7157
7158     pps->weighted_pred= get_bits1(&s->gb);
7159     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7160     pps->init_qp= get_se_golomb(&s->gb) + 26;
7161     pps->init_qs= get_se_golomb(&s->gb) + 26;
7162     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7163     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7164     pps->constrained_intra_pred= get_bits1(&s->gb);
7165     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7166
7167     pps->transform_8x8_mode= 0;
7168     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7169     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7170     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7171
7172     if(get_bits_count(&s->gb) < bit_length){
7173         pps->transform_8x8_mode= get_bits1(&s->gb);
7174         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7175         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7176     } else {
7177         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7178     }
7179
7180     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7181     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7182     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7183         h->pps.chroma_qp_diff= 1;
7184
7185     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7186         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7187                pps_id, pps->sps_id,
7188                pps->cabac ? "CABAC" : "CAVLC",
7189                pps->slice_group_count,
7190                pps->ref_count[0], pps->ref_count[1],
7191                pps->weighted_pred ? "weighted" : "",
7192                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7193                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7194                pps->constrained_intra_pred ? "CONSTR" : "",
7195                pps->redundant_pic_cnt_present ? "REDU" : "",
7196                pps->transform_8x8_mode ? "8x8DCT" : ""
7197                );
7198     }
7199
7200     av_free(h->pps_buffers[pps_id]);
7201     h->pps_buffers[pps_id]= pps;
7202     return 0;
7203 fail:
7204     av_free(pps);
7205     return -1;
7206 }
7207
7208 /**
7209  * Call decode_slice() for each context.
7210  *
7211  * @param h h264 master context
7212  * @param context_count number of contexts to execute
7213  */
7214 static void execute_decode_slices(H264Context *h, int context_count){
7215     MpegEncContext * const s = &h->s;
7216     AVCodecContext * const avctx= s->avctx;
7217     H264Context *hx;
7218     int i;
7219
7220     if(context_count == 1) {
7221         decode_slice(avctx, &h);
7222     } else {
7223         for(i = 1; i < context_count; i++) {
7224             hx = h->thread_context[i];
7225             hx->s.error_recognition = avctx->error_recognition;
7226             hx->s.error_count = 0;
7227         }
7228
7229         avctx->execute(avctx, (void *)decode_slice,
7230                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7231
7232         /* pull back stuff from slices to master context */
7233         hx = h->thread_context[context_count - 1];
7234         s->mb_x = hx->s.mb_x;
7235         s->mb_y = hx->s.mb_y;
7236         s->dropable = hx->s.dropable;
7237         s->picture_structure = hx->s.picture_structure;
7238         for(i = 1; i < context_count; i++)
7239             h->s.error_count += h->thread_context[i]->s.error_count;
7240     }
7241 }
7242
7243
7244 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7245     MpegEncContext * const s = &h->s;
7246     AVCodecContext * const avctx= s->avctx;
7247     int buf_index=0;
7248     H264Context *hx; ///< thread context
7249     int context_count = 0;
7250
7251     h->max_contexts = avctx->thread_count;
7252 #if 0
7253     int i;
7254     for(i=0; i<50; i++){
7255         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7256     }
7257 #endif
7258     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7259         h->current_slice = 0;
7260         if (!s->first_field)
7261             s->current_picture_ptr= NULL;
7262     }
7263
7264     for(;;){
7265         int consumed;
7266         int dst_length;
7267         int bit_length;
7268         const uint8_t *ptr;
7269         int i, nalsize = 0;
7270         int err;
7271
7272         if(h->is_avc) {
7273             if(buf_index >= buf_size) break;
7274             nalsize = 0;
7275             for(i = 0; i < h->nal_length_size; i++)
7276                 nalsize = (nalsize << 8) | buf[buf_index++];
7277             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7278                 if(nalsize == 1){
7279                     buf_index++;
7280                     continue;
7281                 }else{
7282                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7283                     break;
7284                 }
7285             }
7286         } else {
7287             // start code prefix search
7288             for(; buf_index + 3 < buf_size; buf_index++){
7289                 // This should always succeed in the first iteration.
7290                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7291                     break;
7292             }
7293
7294             if(buf_index+3 >= buf_size) break;
7295
7296             buf_index+=3;
7297         }
7298
7299         hx = h->thread_context[context_count];
7300
7301         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7302         if (ptr==NULL || dst_length < 0){
7303             return -1;
7304         }
7305         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7306             dst_length--;
7307         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7308
7309         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7310             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7311         }
7312
7313         if (h->is_avc && (nalsize != consumed)){
7314             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7315             consumed= nalsize;
7316         }
7317
7318         buf_index += consumed;
7319
7320         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7321            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7322             continue;
7323
7324       again:
7325         err = 0;
7326         switch(hx->nal_unit_type){
7327         case NAL_IDR_SLICE:
7328             if (h->nal_unit_type != NAL_IDR_SLICE) {
7329                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7330                 return -1;
7331             }
7332             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7333         case NAL_SLICE:
7334             init_get_bits(&hx->s.gb, ptr, bit_length);
7335             hx->intra_gb_ptr=
7336             hx->inter_gb_ptr= &hx->s.gb;
7337             hx->s.data_partitioning = 0;
7338
7339             if((err = decode_slice_header(hx, h)))
7340                break;
7341
7342             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7343             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7344                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7345                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7346                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7347                && avctx->skip_frame < AVDISCARD_ALL)
7348                 context_count++;
7349             break;
7350         case NAL_DPA:
7351             init_get_bits(&hx->s.gb, ptr, bit_length);
7352             hx->intra_gb_ptr=
7353             hx->inter_gb_ptr= NULL;
7354             hx->s.data_partitioning = 1;
7355
7356             err = decode_slice_header(hx, h);
7357             break;
7358         case NAL_DPB:
7359             init_get_bits(&hx->intra_gb, ptr, bit_length);
7360             hx->intra_gb_ptr= &hx->intra_gb;
7361             break;
7362         case NAL_DPC:
7363             init_get_bits(&hx->inter_gb, ptr, bit_length);
7364             hx->inter_gb_ptr= &hx->inter_gb;
7365
7366             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7367                && s->context_initialized
7368                && s->hurry_up < 5
7369                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7370                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7371                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7372                && avctx->skip_frame < AVDISCARD_ALL)
7373                 context_count++;
7374             break;
7375         case NAL_SEI:
7376             init_get_bits(&s->gb, ptr, bit_length);
7377             decode_sei(h);
7378             break;
7379         case NAL_SPS:
7380             init_get_bits(&s->gb, ptr, bit_length);
7381             decode_seq_parameter_set(h);
7382
7383             if(s->flags& CODEC_FLAG_LOW_DELAY)
7384                 s->low_delay=1;
7385
7386             if(avctx->has_b_frames < 2)
7387                 avctx->has_b_frames= !s->low_delay;
7388             break;
7389         case NAL_PPS:
7390             init_get_bits(&s->gb, ptr, bit_length);
7391
7392             decode_picture_parameter_set(h, bit_length);
7393
7394             break;
7395         case NAL_AUD:
7396         case NAL_END_SEQUENCE:
7397         case NAL_END_STREAM:
7398         case NAL_FILLER_DATA:
7399         case NAL_SPS_EXT:
7400         case NAL_AUXILIARY_SLICE:
7401             break;
7402         default:
7403             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7404         }
7405
7406         if(context_count == h->max_contexts) {
7407             execute_decode_slices(h, context_count);
7408             context_count = 0;
7409         }
7410
7411         if (err < 0)
7412             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7413         else if(err == 1) {
7414             /* Slice could not be decoded in parallel mode, copy down
7415              * NAL unit stuff to context 0 and restart. Note that
7416              * rbsp_buffer is not transferred, but since we no longer
7417              * run in parallel mode this should not be an issue. */
7418             h->nal_unit_type = hx->nal_unit_type;
7419             h->nal_ref_idc   = hx->nal_ref_idc;
7420             hx = h;
7421             goto again;
7422         }
7423     }
7424     if(context_count)
7425         execute_decode_slices(h, context_count);
7426     return buf_index;
7427 }
7428
7429 /**
7430  * returns the number of bytes consumed for building the current frame
7431  */
7432 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7433         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7434         if(pos+10>buf_size) pos=buf_size; // oops ;)
7435
7436         return pos;
7437 }
7438
7439 static int decode_frame(AVCodecContext *avctx,
7440                              void *data, int *data_size,
7441                              const uint8_t *buf, int buf_size)
7442 {
7443     H264Context *h = avctx->priv_data;
7444     MpegEncContext *s = &h->s;
7445     AVFrame *pict = data;
7446     int buf_index;
7447
7448     s->flags= avctx->flags;
7449     s->flags2= avctx->flags2;
7450
7451    /* end of stream, output what is still in the buffers */
7452     if (buf_size == 0) {
7453         Picture *out;
7454         int i, out_idx;
7455
7456 //FIXME factorize this with the output code below
7457         out = h->delayed_pic[0];
7458         out_idx = 0;
7459         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7460             if(h->delayed_pic[i]->poc < out->poc){
7461                 out = h->delayed_pic[i];
7462                 out_idx = i;
7463             }
7464
7465         for(i=out_idx; h->delayed_pic[i]; i++)
7466             h->delayed_pic[i] = h->delayed_pic[i+1];
7467
7468         if(out){
7469             *data_size = sizeof(AVFrame);
7470             *pict= *(AVFrame*)out;
7471         }
7472
7473         return 0;
7474     }
7475
7476     if(h->is_avc && !h->got_avcC) {
7477         int i, cnt, nalsize;
7478         unsigned char *p = avctx->extradata;
7479         if(avctx->extradata_size < 7) {
7480             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7481             return -1;
7482         }
7483         if(*p != 1) {
7484             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7485             return -1;
7486         }
7487         /* sps and pps in the avcC always have length coded with 2 bytes,
7488            so put a fake nal_length_size = 2 while parsing them */
7489         h->nal_length_size = 2;
7490         // Decode sps from avcC
7491         cnt = *(p+5) & 0x1f; // Number of sps
7492         p += 6;
7493         for (i = 0; i < cnt; i++) {
7494             nalsize = AV_RB16(p) + 2;
7495             if(decode_nal_units(h, p, nalsize) < 0) {
7496                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7497                 return -1;
7498             }
7499             p += nalsize;
7500         }
7501         // Decode pps from avcC
7502         cnt = *(p++); // Number of pps
7503         for (i = 0; i < cnt; i++) {
7504             nalsize = AV_RB16(p) + 2;
7505             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7506                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7507                 return -1;
7508             }
7509             p += nalsize;
7510         }
7511         // Now store right nal length size, that will be use to parse all other nals
7512         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7513         // Do not reparse avcC
7514         h->got_avcC = 1;
7515     }
7516
7517     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7518         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7519             return -1;
7520         h->got_avcC = 1;
7521     }
7522
7523     buf_index=decode_nal_units(h, buf, buf_size);
7524     if(buf_index < 0)
7525         return -1;
7526
7527     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7528         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7529         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7530         return -1;
7531     }
7532
7533     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7534         Picture *out = s->current_picture_ptr;
7535         Picture *cur = s->current_picture_ptr;
7536         int i, pics, cross_idr, out_of_order, out_idx;
7537
7538         s->mb_y= 0;
7539
7540         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7541         s->current_picture_ptr->pict_type= s->pict_type;
7542
7543         if(!s->dropable) {
7544             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7545             h->prev_poc_msb= h->poc_msb;
7546             h->prev_poc_lsb= h->poc_lsb;
7547         }
7548         h->prev_frame_num_offset= h->frame_num_offset;
7549         h->prev_frame_num= h->frame_num;
7550
7551         /*
7552          * FIXME: Error handling code does not seem to support interlaced
7553          * when slices span multiple rows
7554          * The ff_er_add_slice calls don't work right for bottom
7555          * fields; they cause massive erroneous error concealing
7556          * Error marking covers both fields (top and bottom).
7557          * This causes a mismatched s->error_count
7558          * and a bad error table. Further, the error count goes to
7559          * INT_MAX when called for bottom field, because mb_y is
7560          * past end by one (callers fault) and resync_mb_y != 0
7561          * causes problems for the first MB line, too.
7562          */
7563         if (!FIELD_PICTURE)
7564             ff_er_frame_end(s);
7565
7566         MPV_frame_end(s);
7567
7568         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7569             /* Wait for second field. */
7570             *data_size = 0;
7571
7572         } else {
7573             cur->repeat_pict = 0;
7574
7575             /* Signal interlacing information externally. */
7576             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7577             if(h->sps.pic_struct_present_flag){
7578                 switch (h->sei_pic_struct)
7579                 {
7580                 case SEI_PIC_STRUCT_FRAME:
7581                     cur->interlaced_frame = 0;
7582                     break;
7583                 case SEI_PIC_STRUCT_TOP_FIELD:
7584                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7585                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7586                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7587                     cur->interlaced_frame = 1;
7588                     break;
7589                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7590                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7591                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7592                     // From these hints, let the applications decide if they apply deinterlacing.
7593                     cur->repeat_pict = 1;
7594                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7595                     break;
7596                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7597                     // Force progressive here, as doubling interlaced frame is a bad idea.
7598                     cur->interlaced_frame = 0;
7599                     cur->repeat_pict = 2;
7600                     break;
7601                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7602                     cur->interlaced_frame = 0;
7603                     cur->repeat_pict = 4;
7604                     break;
7605                 }
7606             }else{
7607                 /* Derive interlacing flag from used decoding process. */
7608                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7609             }
7610
7611             if (cur->field_poc[0] != cur->field_poc[1]){
7612                 /* Derive top_field_first from field pocs. */
7613                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7614             }else{
7615                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7616                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7617                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7618                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7619                         cur->top_field_first = 1;
7620                     else
7621                         cur->top_field_first = 0;
7622                 }else{
7623                     /* Most likely progressive */
7624                     cur->top_field_first = 0;
7625                 }
7626             }
7627
7628         //FIXME do something with unavailable reference frames
7629
7630             /* Sort B-frames into display order */
7631
7632             if(h->sps.bitstream_restriction_flag
7633                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7634                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7635                 s->low_delay = 0;
7636             }
7637
7638             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7639                && !h->sps.bitstream_restriction_flag){
7640                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7641                 s->low_delay= 0;
7642             }
7643
7644             pics = 0;
7645             while(h->delayed_pic[pics]) pics++;
7646
7647             assert(pics <= MAX_DELAYED_PIC_COUNT);
7648
7649             h->delayed_pic[pics++] = cur;
7650             if(cur->reference == 0)
7651                 cur->reference = DELAYED_PIC_REF;
7652
7653             out = h->delayed_pic[0];
7654             out_idx = 0;
7655             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7656                 if(h->delayed_pic[i]->poc < out->poc){
7657                     out = h->delayed_pic[i];
7658                     out_idx = i;
7659                 }
7660             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7661
7662             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7663
7664             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7665                 { }
7666             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7667                || (s->low_delay &&
7668                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7669                  || cur->pict_type == FF_B_TYPE)))
7670             {
7671                 s->low_delay = 0;
7672                 s->avctx->has_b_frames++;
7673             }
7674
7675             if(out_of_order || pics > s->avctx->has_b_frames){
7676                 out->reference &= ~DELAYED_PIC_REF;
7677                 for(i=out_idx; h->delayed_pic[i]; i++)
7678                     h->delayed_pic[i] = h->delayed_pic[i+1];
7679             }
7680             if(!out_of_order && pics > s->avctx->has_b_frames){
7681                 *data_size = sizeof(AVFrame);
7682
7683                 h->outputed_poc = out->poc;
7684                 *pict= *(AVFrame*)out;
7685             }else{
7686                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7687             }
7688         }
7689     }
7690
7691     assert(pict->data[0] || !*data_size);
7692     ff_print_debug_info(s, pict);
7693 //printf("out %d\n", (int)pict->data[0]);
7694 #if 0 //?
7695
7696     /* Return the Picture timestamp as the frame number */
7697     /* we subtract 1 because it is added on utils.c     */
7698     avctx->frame_number = s->picture_number - 1;
7699 #endif
7700     return get_consumed_bytes(s, buf_index, buf_size);
7701 }
#if 0
/**
 * Disabled: fill h->mb_avail[] for the current macroblock.
 *
 * Entries 0..2 describe the macroblocks at top-left, top and top-right,
 * entry 3 the one to the left; a neighbour counts as available when it lies
 * inside the picture and its slice_table entry equals h->slice_num.
 * Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = mb_xy - s->mb_stride;

    /* without a row above, none of the top neighbours exists */
    h->mb_avail[0]= 0;
    h->mb_avail[1]= 0;
    h->mb_avail[2]= 0;

    if(s->mb_y){
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }

    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7721
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * buffer, reads them back and prints a message for every value that does
 * not round-trip. The 4x4 DCT, quantizer and NAL layer tests are disabled.
 *
 * @return 0; a mismatch is only reported, it does not change the result
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* remember the upcoming bits for the mismatch report */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* the value written at position i was i - COUNT/2, so that is
             * the expected value to report */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7896
7897
7898 static av_cold int decode_end(AVCodecContext *avctx)
7899 {
7900     H264Context *h = avctx->priv_data;
7901     MpegEncContext *s = &h->s;
7902     int i;
7903
7904     av_freep(&h->rbsp_buffer[0]);
7905     av_freep(&h->rbsp_buffer[1]);
7906     free_tables(h); //FIXME cleanup init stuff perhaps
7907
7908     for(i = 0; i < MAX_SPS_COUNT; i++)
7909         av_freep(h->sps_buffers + i);
7910
7911     for(i = 0; i < MAX_PPS_COUNT; i++)
7912         av_freep(h->pps_buffers + i);
7913
7914     MPV_common_end(s);
7915
7916 //    memset(h, 0, sizeof(H264Context));
7917
7918     return 0;
7919 }
7920
7921
7922 AVCodec h264_decoder = {
7923     "h264",
7924     CODEC_TYPE_VIDEO,
7925     CODEC_ID_H264,
7926     sizeof(H264Context),
7927     decode_init,
7928     NULL,
7929     decode_end,
7930     decode_frame,
7931     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7932     .flush= flush_dpb,
7933     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7934 };
7935
7936 #include "svq3.c"