libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "x86/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /* File-scope VLC descriptors and the static storage declared next to them.
      * Every table is an array of [2]-element VLC_TYPE entries; each *_size
      * constant repeats the number of entries reserved for a single VLC.
      * The code that fills these tables is not part of this section. */

     /* The four coeff_token VLCs share one flat array of 520+332+280+256
      * entries; coeff_token_vlc_tables_size lists the share of each VLC. */
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
     /* 15 VLCs, 512 entries each. */
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
     /* 3 VLCs, 8 entries each. */
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
     /* 6 VLCs, 8 entries each; run7_vlc below has its own 96-entry table. */
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  /* Forward declarations of static functions whose definitions are not part
   * of this section of the file. */
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  /**
   * Packs the low 16 bits of two integers into one 32-bit word.
   *
   * Without WORDS_BIGENDIAN, a ends up in bits 0..15 and b in bits 16..31;
   * with WORDS_BIGENDIAN the two roles are swapped.
   *
   * The shift is carried out on uint32_t: left-shifting a negative (or too
   * large) signed int is undefined behaviour in C, and both arguments may be
   * negative.
   *
   * @param a first 16-bit value (only the low 16 bits are kept)
   * @param b second 16-bit value (only the low 16 bits are kept)
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  88
  /** Lookup table: rem6[i] == i % 6 for every i in 0..51. */
  static const uint8_t rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  92
  /** Lookup table: div6[i] == i / 6 for every i in 0..51. */
  static const uint8_t div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  96
  /**
   * Index sets used by fill_caches() to address entries of the left
   * neighbour. Row selection as done in fill_caches():
   *   row 0 - default (left neighbour has the same frame/field flag)
   *   row 1 - current MB is a frame MB, bottom of its pair, left pair differs
   *   row 2 - current MB is a frame MB, top of its pair, left pair differs
   *   row 3 - current MB is a field MB, left pair differs
   */
  static const int left_block_options[4][8]={
      { 0, 1, 2, 3,   7, 10,  8, 11 },
      { 2, 2, 3, 3,   8, 11,  8, 11 },
      { 0, 0, 1, 1,   7, 10,  7, 10 },
      { 0, 2, 0, 2,   7, 10,  7, 10 },
  };
 103
 /**
  * Fills the neighbour caches of h for the macroblock at h->mb_xy.
  *
  * The function first determines the indices of the top, top-left, top-right
  * and left neighbour macroblocks (with extra adjustments when FRAME_MBAFF is
  * set) and stores the top/left ones in h->top_mb_xy and h->left_mb_xy[].
  * It then loads, depending on mb_type and on the neighbour types:
  *  - the intra sample-availability masks and intra4x4_pred_mode_cache
  *    (only when for_deblock is 0 and mb_type is intra),
  *  - non_zero_count_cache,
  *  - top_cbp / left_cbp (only when h->pps.cabac is set),
  *  - mv_cache / ref_cache, and for CABAC also mvd_cache and direct_cache,
  *  - h->neighbor_transform_size.
  *
  * @param h           decoder context, read and updated in place
  * @param mb_type     type flags of the current macroblock
  * @param for_deblock non-zero when the caches are filled for deblocking: the
  *                    neighbour types are then taken from any macroblock whose
  *                    slice_table entry is below 0xFFFF, the intra part is
  *                    skipped and the top-left/top-right motion data is not
  *                    loaded
  */
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
         /* Early out for the deblocking call: taken when FRAME_MBAFF is not set
          * and either slice_num is 1 or the top neighbour has the same
          * slice_table entry as the current macroblock. */
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
         /* Default neighbour positions; the FRAME_MBAFF block below may move them. */
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
             /* Left pair has a different frame/field flag: address the left
              * neighbour(s) through the pair and pick another left_block row. */
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
         /* Store the selected top/left neighbour indices in the context. */
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
             /* Deblocking: top-left/top-right are not used; top and left count as
              * present whenever their slice_table entry is below 0xFFFF. */
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values were changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
             /* Decoding: a neighbour only counts when its slice_table entry equals
              * the current slice_num; otherwise its type is treated as 0. */
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
         /* Note: despite its indentation, this block is still inside the
          * !for_deblock branch opened above. With constrained_intra_pred only
          * intra neighbours pass type_mask; otherwise every non-zero type does. */
 208     if(IS_INTRA(mb_type)){
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
             /* Neighbouring intra4x4 modes: copied from an intra4x4 neighbour,
              * otherwise 2 for a usable neighbour and -1 for an unusable one. */
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
         /* Non-zero counts of the top neighbour; when it is missing the cache is
          * set to 0 for a non-intra macroblock with CABAC and to 64 otherwise. */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
         /* Same for the two left neighbours, addressed through left_block. */
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
         /* CABAC only: cbp of the top and left neighbours, with fixed fallback
          * values (0x1C0 for an intra macroblock, 0 otherwise) when missing. */
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
         /* Motion data of the neighbours, one pass per reference list. */
 364 #if 1
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
                 /* Top neighbour: its last row of motion vectors and reference indices. */
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
                 /* Left neighbours: last column, rows chosen through left_block. */
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
                 /* The remaining entries are skipped for deblocking and for
                  * non-spatial direct macroblocks outside FRAME_MBAFF. */
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
                     /* B slices: clear the 4x4 direct_cache area of the current
                      * macroblock, then load the direct flags of the top and
                      * left neighbours. */
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
                 /* FRAME_MBAFF: rescale the cached neighbour entries whose
                  * frame/field flag differs from the current macroblock.
                  * MAP_MVS applies MAP_F2F to every neighbour cache position;
                  * MAP_F2F is defined separately for the field and frame case. */
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
         /* Number (0..2) of top/left[0] neighbours flagged as 8x8 DCT. */
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1434
1435     *dst_length= di;
1436     *consumed= si + 1;//+1 for the header
1437 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1438     return dst;
1439 }
1440
1441 /**
1442  * identifies the exact end of the bitstream
1443  * @return the length of the trailing, or 0 if damaged
1444  */
1445 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1446     int v= *src;
1447     int r;
1448
1449     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1450
1451     for(r=1; r<9; r++){
1452         if(v&1) return r;
1453         v>>=1;
1454     }
1455     return 0;
1456 }
1457
1458 /**
1459  * IDCT transforms the 16 dc values and dequantizes them.
1460  * @param qp quantization parameter
1461  */
1462 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1463 #define stride 16
1464     int i;
1465     int temp[16]; //FIXME check if this is a good idea
1466     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1467     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1468
1469 //memset(block, 64, 2*256);
1470 //return;
1471     for(i=0; i<4; i++){
1472         const int offset= y_offset[i];
1473         const int z0= block[offset+stride*0] + block[offset+stride*4];
1474         const int z1= block[offset+stride*0] - block[offset+stride*4];
1475         const int z2= block[offset+stride*1] - block[offset+stride*5];
1476         const int z3= block[offset+stride*1] + block[offset+stride*5];
1477
1478         temp[4*i+0]= z0+z3;
1479         temp[4*i+1]= z1+z2;
1480         temp[4*i+2]= z1-z2;
1481         temp[4*i+3]= z0-z3;
1482     }
1483
1484     for(i=0; i<4; i++){
1485         const int offset= x_offset[i];
1486         const int z0= temp[4*0+i] + temp[4*2+i];
1487         const int z1= temp[4*0+i] - temp[4*2+i];
1488         const int z2= temp[4*1+i] - temp[4*3+i];
1489         const int z3= temp[4*1+i] + temp[4*3+i];
1490
1491         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1492         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1493         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1494         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1495     }
1496 }
1497
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; relies on the stride macro defined by the preceding
 * function. The values are transformed in place and halved.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: gather into temp */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        const int sum04= in[stride*0] + in[stride*4];
        const int dif04= in[stride*0] - in[stride*4];
        const int dif15= in[stride*1] - in[stride*5];
        const int sum15= in[stride*1] + in[stride*5];
        int *out= temp + 4*i;

        out[0]= sum04 + sum15;
        out[1]= dif04 + dif15;
        out[2]= dif04 - dif15;
        out[3]= sum04 - sum15;
    }

    /* second pass: other direction, results are halved */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int sum02= temp[4*0+i] + temp[4*2+i];
        const int dif02= temp[4*0+i] - temp[4*2+i];
        const int dif13= temp[4*1+i] - temp[4*3+i];
        const int sum13= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (sum02 + sum13)>>1;
        out[stride*2 ]= (dif02 + dif13)>>1;
        out[stride*8 ]= (dif02 - dif13)>>1;
        out[stride*10]= (sum02 - sum13)>>1;
    }
}
#endif
1537
1538 #undef xStride
1539 #undef stride
1540
1541 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1542     const int stride= 16*2;
1543     const int xStride= 16;
1544     int a,b,c,d,e;
1545
1546     a= block[stride*0 + xStride*0];
1547     b= block[stride*0 + xStride*1];
1548     c= block[stride*1 + xStride*0];
1549     d= block[stride*1 + xStride*1];
1550
1551     e= a-b;
1552     a= a+b;
1553     b= c-d;
1554     c= c+d;
1555
1556     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1557     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1558     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1559     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1560 }
1561
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code).
 * The four values sit at offsets 0, 16, 32 and 48 of block.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    const int v00= block[row_step*0 + col_step*0];
    const int v01= block[row_step*0 + col_step*1];
    const int v10= block[row_step*1 + col_step*0];
    const int v11= block[row_step*1 + col_step*1];
    const int top_sum= v00 + v01;
    const int top_dif= v00 - v01;
    const int bot_sum= v10 + v11;
    const int bot_dif= v10 - v11;

    block[row_step*0 + col_step*0]= (top_sum + bot_sum);
    block[row_step*0 + col_step*1]= (top_dif + bot_dif);
    block[row_step*1 + col_step*0]= (top_sum - bot_sum);
    block[row_step*1 + col_step*1]= (top_dif - bot_dif);
}
#endif
1584
1585 /**
1586  * gets the chroma qp.
1587  */
1588 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1589     return h->pps.chroma_qp_table[t][qscale];
1590 }
1591
1592 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1593                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1594                            int src_x_offset, int src_y_offset,
1595                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1596     MpegEncContext * const s = &h->s;
1597     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1598     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1599     const int luma_xy= (mx&3) + ((my&3)<<2);
1600     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1601     uint8_t * src_cb, * src_cr;
1602     int extra_width= h->emu_edge_width;
1603     int extra_height= h->emu_edge_height;
1604     int emu=0;
1605     const int full_mx= mx>>2;
1606     const int full_my= my>>2;
1607     const int pic_width  = 16*s->mb_width;
1608     const int pic_height = 16*s->mb_height >> MB_FIELD;
1609
1610     if(mx&7) extra_width -= 3;
1611     if(my&7) extra_height -= 3;
1612
1613     if(   full_mx < 0-extra_width
1614        || full_my < 0-extra_height
1615        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1616        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1617         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1618             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1619         emu=1;
1620     }
1621
1622     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1623     if(!square){
1624         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1625     }
1626
1627     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1628
1629     if(MB_FIELD){
1630         // chroma offset when predicting from a field of opposite parity
1631         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1632         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1633     }
1634     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1635     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1636
1637     if(emu){
1638         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1639             src_cb= s->edge_emu_buffer;
1640     }
1641     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1642
1643     if(emu){
1644         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1645             src_cr= s->edge_emu_buffer;
1646     }
1647     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1648 }
1649
1650 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1651                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1652                            int x_offset, int y_offset,
1653                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1654                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1655                            int list0, int list1){
1656     MpegEncContext * const s = &h->s;
1657     qpel_mc_func *qpix_op=  qpix_put;
1658     h264_chroma_mc_func chroma_op= chroma_put;
1659
1660     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1661     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1662     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1663     x_offset += 8*s->mb_x;
1664     y_offset += 8*(s->mb_y >> MB_FIELD);
1665
1666     if(list0){
1667         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1668         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1669                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1670                            qpix_op, chroma_op);
1671
1672         qpix_op=  qpix_avg;
1673         chroma_op= chroma_avg;
1674     }
1675
1676     if(list1){
1677         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681     }
1682 }
1683
1684 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1685                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1686                            int x_offset, int y_offset,
1687                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1688                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1689                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1690                            int list0, int list1){
1691     MpegEncContext * const s = &h->s;
1692
1693     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1694     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1695     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1696     x_offset += 8*s->mb_x;
1697     y_offset += 8*(s->mb_y >> MB_FIELD);
1698
1699     if(list0 && list1){
1700         /* don't optimize for luma-only case, since B-frames usually
1701          * use implicit weights => chroma too. */
1702         uint8_t *tmp_cb = s->obmc_scratchpad;
1703         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1704         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1705         int refn0 = h->ref_cache[0][ scan8[n] ];
1706         int refn1 = h->ref_cache[1][ scan8[n] ];
1707
1708         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1709                     dest_y, dest_cb, dest_cr,
1710                     x_offset, y_offset, qpix_put, chroma_put);
1711         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1712                     tmp_y, tmp_cb, tmp_cr,
1713                     x_offset, y_offset, qpix_put, chroma_put);
1714
1715         if(h->use_weight == 2){
1716             int weight0 = h->implicit_weight[refn0][refn1];
1717             int weight1 = 64 - weight0;
1718             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1719             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1720             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721         }else{
1722             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1723                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1724                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1725             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1726                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1727                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1729                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1730                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1731         }
1732     }else{
1733         int list = list1 ? 1 : 0;
1734         int refn = h->ref_cache[list][ scan8[n] ];
1735         Picture *ref= &h->ref_list[list][refn];
1736         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1737                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1738                     qpix_put, chroma_put);
1739
1740         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1742         if(h->use_weight_chroma){
1743             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1745             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1746                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1747         }
1748     }
1749 }
1750
1751 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1757                            int list0, int list1){
1758     if((h->use_weight==2 && list0 && list1
1759         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1760        || h->use_weight==1)
1761         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1762                          x_offset, y_offset, qpix_put, chroma_put,
1763                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1764     else
1765         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1766                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1767 }
1768
1769 static inline void prefetch_motion(H264Context *h, int list){
1770     /* fetch pixels for estimated mv 4 macroblocks ahead
1771      * optimized for 64byte cache lines */
1772     MpegEncContext * const s = &h->s;
1773     const int refn = h->ref_cache[list][scan8[0]];
1774     if(refn >= 0){
1775         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1776         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1777         uint8_t **src= h->ref_list[list][refn].data;
1778         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1779         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1780         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1781         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1782     }
1783 }
1784
1785 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1786                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1787                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1788                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1789     MpegEncContext * const s = &h->s;
1790     const int mb_xy= h->mb_xy;
1791     const int mb_type= s->current_picture.mb_type[mb_xy];
1792
1793     assert(IS_INTER(mb_type));
1794
1795     prefetch_motion(h, 0);
1796
1797     if(IS_16X16(mb_type)){
1798         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1799                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1800                 &weight_op[0], &weight_avg[0],
1801                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1802     }else if(IS_16X8(mb_type)){
1803         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1804                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1805                 &weight_op[1], &weight_avg[1],
1806                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1807         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1808                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1809                 &weight_op[1], &weight_avg[1],
1810                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1811     }else if(IS_8X16(mb_type)){
1812         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1813                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1814                 &weight_op[2], &weight_avg[2],
1815                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1816         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1817                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1818                 &weight_op[2], &weight_avg[2],
1819                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1820     }else{
1821         int i;
1822
1823         assert(IS_8X8(mb_type));
1824
1825         for(i=0; i<4; i++){
1826             const int sub_mb_type= h->sub_mb_type[i];
1827             const int n= 4*i;
1828             int x_offset= (i&1)<<2;
1829             int y_offset= (i&2)<<1;
1830
1831             if(IS_SUB_8X8(sub_mb_type)){
1832                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1833                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1834                     &weight_op[3], &weight_avg[3],
1835                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1836             }else if(IS_SUB_8X4(sub_mb_type)){
1837                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1838                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1839                     &weight_op[4], &weight_avg[4],
1840                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1841                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1842                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1843                     &weight_op[4], &weight_avg[4],
1844                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1845             }else if(IS_SUB_4X8(sub_mb_type)){
1846                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1847                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1848                     &weight_op[5], &weight_avg[5],
1849                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1850                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1851                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1852                     &weight_op[5], &weight_avg[5],
1853                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1854             }else{
1855                 int j;
1856                 assert(IS_SUB_4X4(sub_mb_type));
1857                 for(j=0; j<4; j++){
1858                     int sub_x_offset= x_offset + 2*(j&1);
1859                     int sub_y_offset= y_offset +   (j&2);
1860                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1861                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                         &weight_op[6], &weight_avg[6],
1863                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864                 }
1865             }
1866         }
1867     }
1868
1869     prefetch_motion(h, 1);
1870 }
1871
1872 static av_cold void decode_init_vlc(void){
1873     static int done = 0;
1874
1875     if (!done) {
1876         int i;
1877         int offset;
1878         done = 1;
1879
1880         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1881         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1882         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1883                  &chroma_dc_coeff_token_len [0], 1, 1,
1884                  &chroma_dc_coeff_token_bits[0], 1, 1,
1885                  INIT_VLC_USE_NEW_STATIC);
1886
1887         offset = 0;
1888         for(i=0; i<4; i++){
1889             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1890             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1891             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1892                      &coeff_token_len [i][0], 1, 1,
1893                      &coeff_token_bits[i][0], 1, 1,
1894                      INIT_VLC_USE_NEW_STATIC);
1895             offset += coeff_token_vlc_tables_size[i];
1896         }
1897         /*
1898          * This is a one time safety check to make sure that
1899          * the packed static coeff_token_vlc table sizes
1900          * were initialized correctly.
1901          */
1902         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1903
1904         for(i=0; i<3; i++){
1905             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1906             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1907             init_vlc(&chroma_dc_total_zeros_vlc[i],
1908                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1909                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1910                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1911                      INIT_VLC_USE_NEW_STATIC);
1912         }
1913         for(i=0; i<15; i++){
1914             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1915             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1916             init_vlc(&total_zeros_vlc[i],
1917                      TOTAL_ZEROS_VLC_BITS, 16,
1918                      &total_zeros_len [i][0], 1, 1,
1919                      &total_zeros_bits[i][0], 1, 1,
1920                      INIT_VLC_USE_NEW_STATIC);
1921         }
1922
1923         for(i=0; i<6; i++){
1924             run_vlc[i].table = run_vlc_tables[i];
1925             run_vlc[i].table_allocated = run_vlc_tables_size;
1926             init_vlc(&run_vlc[i],
1927                      RUN_VLC_BITS, 7,
1928                      &run_len [i][0], 1, 1,
1929                      &run_bits[i][0], 1, 1,
1930                      INIT_VLC_USE_NEW_STATIC);
1931         }
1932         run7_vlc.table = run7_vlc_table,
1933         run7_vlc.table_allocated = run7_vlc_table_size;
1934         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1935                  &run_len [6][0], 1, 1,
1936                  &run_bits[6][0], 1, 1,
1937                  INIT_VLC_USE_NEW_STATIC);
1938     }
1939 }
1940
1941 static void free_tables(H264Context *h){
1942     int i;
1943     H264Context *hx;
1944     av_freep(&h->intra4x4_pred_mode);
1945     av_freep(&h->chroma_pred_mode_table);
1946     av_freep(&h->cbp_table);
1947     av_freep(&h->mvd_table[0]);
1948     av_freep(&h->mvd_table[1]);
1949     av_freep(&h->direct_table);
1950     av_freep(&h->non_zero_count);
1951     av_freep(&h->slice_table_base);
1952     h->slice_table= NULL;
1953
1954     av_freep(&h->mb2b_xy);
1955     av_freep(&h->mb2b8_xy);
1956
1957     for(i = 0; i < h->s.avctx->thread_count; i++) {
1958         hx = h->thread_context[i];
1959         if(!hx) continue;
1960         av_freep(&hx->top_borders[1]);
1961         av_freep(&hx->top_borders[0]);
1962         av_freep(&hx->s.obmc_scratchpad);
1963     }
1964 }
1965
1966 static void init_dequant8_coeff_table(H264Context *h){
1967     int i,q,x;
1968     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1969     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1970     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1971
1972     for(i=0; i<2; i++ ){
1973         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1974             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1975             break;
1976         }
1977
1978         for(q=0; q<52; q++){
1979             int shift = div6[q];
1980             int idx = rem6[q];
1981             for(x=0; x<64; x++)
1982                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1983                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1984                     h->pps.scaling_matrix8[i][x]) << shift;
1985         }
1986     }
1987 }
1988
1989 static void init_dequant4_coeff_table(H264Context *h){
1990     int i,j,q,x;
1991     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1992     for(i=0; i<6; i++ ){
1993         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1994         for(j=0; j<i; j++){
1995             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1996                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1997                 break;
1998             }
1999         }
2000         if(j<i)
2001             continue;
2002
2003         for(q=0; q<52; q++){
2004             int shift = div6[q] + 2;
2005             int idx = rem6[q];
2006             for(x=0; x<16; x++)
2007                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2008                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2009                     h->pps.scaling_matrix4[i][x]) << shift;
2010         }
2011     }
2012 }
2013
2014 static void init_dequant_tables(H264Context *h){
2015     int i,x;
2016     init_dequant4_coeff_table(h);
2017     if(h->pps.transform_8x8_mode)
2018         init_dequant8_coeff_table(h);
2019     if(h->sps.transform_bypass){
2020         for(i=0; i<6; i++)
2021             for(x=0; x<16; x++)
2022                 h->dequant4_coeff[i][0][x] = 1<<6;
2023         if(h->pps.transform_8x8_mode)
2024             for(i=0; i<2; i++)
2025                 for(x=0; x<64; x++)
2026                     h->dequant8_coeff[i][0][x] = 1<<6;
2027     }
2028 }
2029
2030
2031 /**
2032  * allocates tables.
2033  * needs width/height
      *
      * All tables are sized from s->mb_stride and s->mb_height. The function also
      * fills the macroblock -> block index maps (mb2b_xy, mb2b8_xy) and builds the
      * dequantization tables when h->dequant4_coeff[0] is still unset.
      * @return 0 on success; -1 through the fail label, after calling free_tables()
      *         (presumably reached from CHECKED_ALLOCZ when an allocation fails;
      *         the macro is defined elsewhere)
2034  */
2035 static int alloc_tables(H264Context *h){
2036     MpegEncContext * const s = &h->s;
         /* table size in macroblocks: one row more than mb_height */
2037     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2038     int x,y;
2039
2040     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2041
2042     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2043     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2044     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2045
2046     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2047     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2048     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2049     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2050
         /* set every byte of the slice table to 0xFF; slice_table itself starts
          * 2*mb_stride+1 entries into the base allocation, so negative indices
          * down to -(2*mb_stride+1) still land inside slice_table_base */
2051     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2052     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2053
2054     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2055     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
         /* for every macroblock, store the index of its first entry in arrays laid
          * out with h->b_stride (4 entries per MB per row) and h->b8_stride (2) */
2056     for(y=0; y<s->mb_height; y++){
2057         for(x=0; x<s->mb_width; x++){
2058             const int mb_xy= x + y*s->mb_stride;
2059             const int b_xy = 4*x + 4*y*h->b_stride;
2060             const int b8_xy= 2*x + 2*y*h->b8_stride;
2061
2062             h->mb2b_xy [mb_xy]= b_xy;
2063             h->mb2b8_xy[mb_xy]= b8_xy;
2064         }
2065     }
2066
         /* not allocated here; frame_start() allocates it once linesize is known */
2067     s->obmc_scratchpad = NULL;
2068
2069     if(!h->dequant4_coeff[0])
2070         init_dequant_tables(h);
2071
2072     return 0;
2073 fail:
2074     free_tables(h);
2075     return -1;
2076 }
2077
2078 /**
2079  * Per-thread counterpart of alloc_tables(): nothing is allocated, dst is
2080  * simply pointed at the tables held by src.
      * dst gets no scratchpad (set to NULL) and its own initialised
      * prediction function table.
      */
2081 static void clone_tables(H264Context *dst, H264Context *src){
          /* macroblock index maps and slice ownership */
2082     dst->mb2b_xy                  = src->mb2b_xy;
2083     dst->mb2b8_xy                 = src->mb2b8_xy;
2084     dst->slice_table              = src->slice_table;
          /* per-macroblock side information */
2085     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2086     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2087     dst->non_zero_count           = src->non_zero_count;
2088     dst->cbp_table                = src->cbp_table;
2089     dst->direct_table             = src->direct_table;
2090     dst->mvd_table[0]             = src->mvd_table[0];
2091     dst->mvd_table[1]             = src->mvd_table[1];
2092
2093     dst->s.obmc_scratchpad = NULL;
2094     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2095 }
2096
2097 /**
2098  * Initialises one decoding context.
2099  * Allocates the two top border buffers, which every thread context owns
2100  * itself rather than sharing.
      * @return 0 on success, -1 through the fail label
      */
2101 static int context_init(H264Context *h){
2102     int i;
2103     for(i=0; i<2; i++){
2104         CHECKED_ALLOCZ(h->top_borders[i], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2105     }
2106     return 0;
2107 fail: return -1; // free_tables will clean up for us
2108 }
2109
/**
 * Initialisation shared by users of H264Context: copies the picture size and
 * codec id from s->avctx, sets up the prediction and DSP function tables and
 * resets the scaling matrices.
 * s->avctx must already be set by the caller.
 */
2110 static av_cold void common_init(H264Context *h){
2111     MpegEncContext * const s = &h->s;
2112
          /* every scaling matrix entry starts out as 16 */
2113     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2114     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2115     h->dequant_coeff_pps= -1;
2116
2117     s->codec_id= s->avctx->codec->id;
2118     s->width   = s->avctx->width;
2119     s->height  = s->avctx->height;
2120     s->unrestricted_mv=1;
2121     s->decode=1; //FIXME
2122
2123     ff_h264_pred_init(&h->hpc, s->codec_id);
2124     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2125 }
2128
/**
 * Decoder init callback: applies the MpegEncContext decode defaults, runs
 * common_init(), selects the output pixel format, initialises the VLC tables
 * and detects AVC-style extradata.
 * @return always 0
 */
2129 static av_cold int decode_init(AVCodecContext *avctx){
2130     H264Context *h= avctx->priv_data;
2131     MpegEncContext * const s = &h->s;
2132
2133     MPV_decode_defaults(s);
2134
2135     s->avctx = avctx;
2136     common_init(h);
2137
2138     s->out_format     = FMT_H264;
2139     s->workaround_bugs= avctx->workaround_bugs;
2140
2141     // set defaults
2142     s->quarter_sample = 1;
2143     s->low_delay      = 1;
2144
2145     avctx->pix_fmt= (avctx->codec_id == CODEC_ID_SVQ3) ? PIX_FMT_YUVJ420P : PIX_FMT_YUV420P;
2146
2147     decode_init_vlc();
2148
          // non-empty extradata whose first byte is 1 selects AVC mode
2149     h->is_avc = 0;
2150     if(avctx->extradata && avctx->extradata_size > 0 && *(char *)avctx->extradata == 1){
2151         h->is_avc = 1;
2152         h->got_avcC = 0;
2153     }
2154
2155     h->thread_context[0] = h;
2156     h->outputed_poc = INT_MIN;
2157     h->prev_poc_msb= 1<<16;
2158     return 0;
2159 }
2166
/**
 * Per-picture setup: starts the frame in the MpegEncContext layer and the
 * error resilience code, fills h->block_offset from the current line sizes,
 * allocates missing per-thread scratchpads and resets per-picture state.
 * @return 0 on success, -1 if MPV_frame_start() failed
 */
2167 static int frame_start(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169     int i;
2170
2171     if(MPV_frame_start(s, s->avctx) < 0)
2172         return -1;
2173     ff_er_frame_start(s);
2174     /*
2175      * MPV_frame_start uses pict_type to derive key_frame.
2176      * This is incorrect for H.264; IDR markings must be used.
2177      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2178      * See decode_nal_units().
2179      */
2180     s->current_picture_ptr->key_frame= 0;
2181
2182     assert(s->linesize && s->uvlinesize);
2183
         /* block_offset[0..15]: luma block offsets, [16..23]: chroma block offsets;
          * entries from index 24 on repeat the same layout with twice the line
          * stride (hl_decode_mb_internal() uses &block_offset[24] for MB_FIELD) */
2184     for(i=0; i<16; i++){
2185         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2186         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2187     }
2188     for(i=0; i<4; i++){
2189         h->block_offset[16+i]=
2190         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2191         h->block_offset[24+16+i]=
2192         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2193     }
2194
2195     /* can't be in alloc_tables because linesize isn't known there.
2196      * FIXME: redo bipred weight to not require extra buffer? */
         /* NOTE(review): the av_malloc() result is stored without a NULL check --
          * confirm that users of obmc_scratchpad cope with allocation failure */
2197     for(i = 0; i < s->avctx->thread_count; i++)
2198         if(!h->thread_context[i]->s.obmc_scratchpad)
2199             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2200
2201     /* some macroblocks will be accessed before they're available */
2202     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2203         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2204
2205 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2206
2207     // We mark the current picture as non-reference after allocating it, so
2208     // that if we break out due to an error it can be released automatically
2209     // in the next MPV_frame_start().
2210     // SVQ3 as well as most other codecs have only last/next/current and thus
2211     // get released even with set reference, besides SVQ3 and others do not
2212     // mark frames as reference later "naturally".
2213     if(s->codec_id != CODEC_ID_SVQ3)
2214         s->current_picture_ptr->reference= 0;
2215
         /* INT_MAX marks both field POCs of the new picture as not yet set
          * (presumably filled in later by the slice header code -- TODO confirm) */
2216     s->current_picture_ptr->field_poc[0]=
2217     s->current_picture_ptr->field_poc[1]= INT_MAX;
2218     assert(s->current_picture_ptr->long_ref==0);
2219
2220     return 0;
2221 }
2222
/**
 * Saves the right-most pixel column and the bottom pixel row of the current
 * macroblock (luma and, unless gray-only decoding applies, chroma) into
 * h->left_border and h->top_borders[..][s->mb_x].
 * hl_decode_mb_internal() calls this just before the deblocking filter runs,
 * presumably so that unfiltered samples stay available for later intra
 * prediction (see xchg_mb_border()).
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line strides in use for this macroblock
 * @param simple nonzero skips the FRAME_MBAFF handling and the gray check
 */
2223 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2224     MpegEncContext * const s = &h->s;
2225     int i;
2226     int step    = 1;
2227     int offset  = 1;
2228     int uvoffset= 1;
2229     int top_idx = 1;
2230     int skiplast= 0;
2231
         /* step back one line so that row i of the macroblock is at index i+1 */
2232     src_y  -=   linesize;
2233     src_cb -= uvlinesize;
2234     src_cr -= uvlinesize;
2235
         /* MBAFF: pick where in left_border / which top_borders line this
          * macroblock of the pair stores to, depending on mb_y parity and on
          * whether the macroblock itself is field coded (MB_MBAFF) */
2236     if(!simple && FRAME_MBAFF){
2237         if(s->mb_y&1){
2238             offset  = MB_MBAFF ? 1 : 17;
2239             uvoffset= MB_MBAFF ? 1 : 9;
2240             if(!MB_MBAFF){
2241                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2242                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2243                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2244                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2245                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2246                 }
2247             }
2248         }else{
2249             if(!MB_MBAFF){
2250                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2251                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2252                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2253                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2254                 }
2255                 skiplast= 1;
2256             }
2257             offset  =
2258             uvoffset=
2259             top_idx = MB_MBAFF ? 0 : 1;
2260         }
2261         step= MB_MBAFF ? 2 : 1;
2262     }
2263
2264     // There are two lines saved, the line above the top macroblock of a pair,
2265     // and the line above the bottom macroblock
         /* first entry: the old top border's last luma sample (the corner), then
          * the macroblock's right-most luma column, one sample per row */
2266     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2267     for(i=1; i<17 - skiplast; i++){
2268         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2269     }
2270
         /* bottom luma row (16 samples, copied as two 64-bit words) */
2271     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2272     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2273
         /* same for both chroma planes: cb column at left_border[34..],
          * cr column at left_border[34+18..], bottom rows at top_borders +16/+24 */
2274     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2275         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2276         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2277         for(i=1; i<9 - skiplast; i++){
2278             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2279             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2280         }
2281         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2282         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2283     }
2284 }
2285
/**
 * Exchanges the samples saved in h->left_border / h->top_borders with the
 * column to the left of and the row above the current macroblock in the
 * picture. hl_decode_mb_internal() calls it with xchg=1 before intra
 * prediction and with xchg=0 afterwards, in both cases only when
 * h->deblocking_filter is set.
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line strides in use for this macroblock
 * @param xchg forwarded to most XCHG() uses: nonzero swaps border and picture
 *             samples, zero only writes the border value into the picture.
 *             Some uses pass a constant 1 instead.
 * @param simple nonzero skips the FRAME_MBAFF handling and the gray check
 */
2286 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2287     MpegEncContext * const s = &h->s;
2288     int temp8, i;
2289     uint64_t temp64;
2290     int deblock_left;
2291     int deblock_top;
2292     int mb_xy;
2293     int step    = 1;
2294     int offset  = 1;
2295     int uvoffset= 1;
2296     int top_idx = 1;
2297
         /* same left_border / top_borders addressing as in backup_mb_border() */
2298     if(!simple && FRAME_MBAFF){
2299         if(s->mb_y&1){
2300             offset  = MB_MBAFF ? 1 : 17;
2301             uvoffset= MB_MBAFF ? 1 : 9;
2302         }else{
2303             offset  =
2304             uvoffset=
2305             top_idx = MB_MBAFF ? 0 : 1;
2306         }
2307         step= MB_MBAFF ? 2 : 1;
2308     }
2309
         /* with deblocking_filter == 2 a neighbour only counts when it has the
          * same slice_table entry as this macroblock; otherwise any neighbour
          * inside the picture counts */
2310     if(h->deblocking_filter == 2) {
2311         mb_xy = h->mb_xy;
2312         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2313         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2314     } else {
2315         deblock_left = (s->mb_x > 0);
2316         deblock_top =  (s->mb_y > !!MB_FIELD);
2317     }
2318
         /* move to the sample diagonally above-left of the macroblock */
2319     src_y  -=   linesize + 1;
2320     src_cb -= uvlinesize + 1;
2321     src_cr -= uvlinesize + 1;
2322
         /* XCHG(a,b,t,xchg): b always receives the old value of a; a receives
          * the old value of b only when xchg is nonzero */
2323 #define XCHG(a,b,t,xchg)\
2324 t= a;\
2325 if(xchg)\
2326     a= b;\
2327 b= t;
2328
         /* left luma column; row 0 (the corner) is skipped when there is no top
          * neighbour, and the last row is always swapped both ways */
2329     if(deblock_left){
2330         for(i = !deblock_top; i<16; i++){
2331             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2332         }
2333         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2334     }
2335
         /* luma row above, plus the first 8 samples of the next macroblock's top
          * border (the above-right samples) when there is a macroblock to the right */
2336     if(deblock_top){
2337         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2338         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2339         if(s->mb_x+1 < s->mb_width){
2340             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2341         }
2342     }
2343
         /* same for both chroma planes unless gray-only decoding applies */
2344     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2345         if(deblock_left){
2346             for(i = !deblock_top; i<8; i++){
2347                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2348                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2349             }
2350             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2351             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2352         }
2353         if(deblock_top){
2354             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2355             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2356         }
2357     }
2358 }
2359
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture: prediction (intra prediction or hl_motion()), residual
 * addition for luma and chroma, clearing of the coefficient buffer h->mb and,
 * when h->deblocking_filter is set, border backup plus deblocking.
 * @param simple nonzero selects the reduced path: every "!simple && ..." test
 *               (MB_FIELD / FRAME_MBAFF handling, intra PCM, transform bypass)
 *               is skipped, the CODEC_FLAG_GRAY checks are bypassed and the
 *               codec is treated as H.264.
 */
2360 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2361     MpegEncContext * const s = &h->s;
2362     const int mb_x= s->mb_x;
2363     const int mb_y= s->mb_y;
2364     const int mb_xy= h->mb_xy;
2365     const int mb_type= s->current_picture.mb_type[mb_xy];
2366     uint8_t  *dest_y, *dest_cb, *dest_cr;
2367     int linesize, uvlinesize /*dct_offset*/;
2368     int i;
2369     int *block_offset = &h->block_offset[0];
2370     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2371     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2372     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2373     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2374
         /* top-left sample of this macroblock in each plane (16x16 luma, 8x8 chroma) */
2375     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2376     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2377     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2378
2379     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2380     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2381
         /* field macroblock: lines are interleaved, so use twice the line stride
          * and the second half of block_offset; the macroblock with odd mb_y
          * starts one picture line below the other one of the pair */
2382     if (!simple && MB_FIELD) {
2383         linesize   = h->mb_linesize   = s->linesize * 2;
2384         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2385         block_offset = &h->block_offset[24];
2386         if(mb_y&1){ //FIXME move out of this function?
2387             dest_y -= s->linesize*15;
2388             dest_cb-= s->uvlinesize*7;
2389             dest_cr-= s->uvlinesize*7;
2390         }
2391         if(FRAME_MBAFF) {
2392             int list;
                 /* rewrite the cached reference indices used by this macroblock to
                  * (16+ref)^(mb_y&1) -- presumably selecting the field entries of
                  * the reference list (TODO confirm against the ref list layout) */
2393             for(list=0; list<h->list_count; list++){
2394                 if(!USES_LIST(mb_type, list))
2395                     continue;
2396                 if(IS_16X16(mb_type)){
2397                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2398                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2399                 }else{
2400                     for(i=0; i<16; i+=4){
2401                         int ref = h->ref_cache[list][scan8[i]];
2402                         if(ref >= 0)
2403                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2404                     }
2405                 }
2406             }
2407         }
2408     } else {
2409         linesize   = h->mb_linesize   = s->linesize;
2410         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2411 //        dct_offset = s->linesize * 16;
2412     }
2413
         /* intra PCM: h->mb is copied straight into the picture */
2414     if (!simple && IS_INTRA_PCM(mb_type)) {
2415         for (i=0; i<16; i++) {
2416             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2417         }
2418         for (i=0; i<8; i++) {
2419             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2420             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2421         }
2422     } else {
2423         if(IS_INTRA(mb_type)){
                 /* swap the saved border samples into the picture for the duration
                  * of intra prediction; swapped back after prediction below */
2424             if(h->deblocking_filter)
2425                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2426
2427             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2428                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2429                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2430             }
2431
                 /* intra 4x4 / 8x8: predict and add the residual block by block */
2432             if(IS_INTRA4x4(mb_type)){
2433                 if(simple || !s->encoding){
2434                     if(IS_8x8DCT(mb_type)){
2435                         if(transform_bypass){
2436                             idct_dc_add =
2437                             idct_add    = s->dsp.add_pixels8;
2438                         }else{
2439                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2440                             idct_add    = s->dsp.h264_idct8_add;
2441                         }
2442                         for(i=0; i<16; i+=4){
2443                             uint8_t * const ptr= dest_y + block_offset[i];
2444                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2445                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2446                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2447                             }else{
2448                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2449                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2450                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2451                                 if(nnz){
2452                                     if(nnz == 1 && h->mb[i*16])
2453                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2454                                     else
2455                                         idct_add   (ptr, h->mb + i*16, linesize);
2456                                 }
2457                             }
2458                         }
2459                     }else{
2460                         if(transform_bypass){
2461                             idct_dc_add =
2462                             idct_add    = s->dsp.add_pixels4;
2463                         }else{
2464                             idct_dc_add = s->dsp.h264_idct_dc_add;
2465                             idct_add    = s->dsp.h264_idct_add;
2466                         }
2467                         for(i=0; i<16; i++){
2468                             uint8_t * const ptr= dest_y + block_offset[i];
2469                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2470
2471                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2472                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2473                             }else{
2474                                 uint8_t *topright;
2475                                 int nnz, tr;
                                     /* only these two modes get a top-right pointer; when
                                      * those samples are unavailable, the last sample of
                                      * the row above is replicated four times instead */
2476                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2477                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2478                                     assert(mb_y || linesize <= block_offset[i]);
2479                                     if(!topright_avail){
2480                                         tr= ptr[3 - linesize]*0x01010101;
2481                                         topright= (uint8_t*) &tr;
2482                                     }else
2483                                         topright= ptr + 4 - linesize;
2484                                 }else
2485                                     topright= NULL;
2486
2487                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2488                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2489                                 if(nnz){
2490                                     if(is_h264){
2491                                         if(nnz == 1 && h->mb[i*16])
2492                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2493                                         else
2494                                             idct_add   (ptr, h->mb + i*16, linesize);
2495                                     }else
2496                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2497                                 }
2498                             }
2499                         }
2500                     }
2501                 }
2502             }else{
                     /* intra 16x16: predict the whole luma block and process the luma
                      * DC coefficients; the residual is added further down */
2503                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2504                 if(is_h264){
2505                     if(!transform_bypass)
2506                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2507                 }else
2508                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2509             }
2510             if(h->deblocking_filter)
2511                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2512         }else if(is_h264){
2513             hl_motion(h, dest_y, dest_cb, dest_cr,
2514                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2515                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2516                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2517         }
2518
2519
             /* luma residual for everything except intra 4x4 (handled above) */
2520         if(!IS_INTRA4x4(mb_type)){
2521             if(is_h264){
2522                 if(IS_INTRA16x16(mb_type)){
2523                     if(transform_bypass){
2524                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2525                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2526                         }else{
2527                             for(i=0; i<16; i++){
2528                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2529                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2530                             }
2531                         }
2532                     }else{
2533                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2534                     }
2535                 }else if(h->cbp&15){
2536                     if(transform_bypass){
2537                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2538                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2539                         for(i=0; i<16; i+=di){
2540                             if(h->non_zero_count_cache[ scan8[i] ]){
2541                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2542                             }
2543                         }
2544                     }else{
2545                         if(IS_8x8DCT(mb_type)){
2546                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2547                         }else{
2548                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2549                         }
2550                     }
2551                 }
2552             }else{
2553                 for(i=0; i<16; i++){
2554                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2555                         uint8_t * const ptr= dest_y + block_offset[i];
2556                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2557                     }
2558                 }
2559             }
2560         }
2561
             /* chroma residual, only when the cbp has chroma bits set; blocks
              * 16..19 go to cb and 20..23 to cr (dest[(i&4)>>2]) */
2562         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2563             uint8_t *dest[2] = {dest_cb, dest_cr};
2564             if(transform_bypass){
2565                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2566                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2567                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2568                 }else{
2569                     idct_add = s->dsp.add_pixels4;
2570                     for(i=16; i<16+8; i++){
2571                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2572                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2573                     }
2574                 }
2575             }else{
2576                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2577                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2578                 if(is_h264){
2579                     idct_add = s->dsp.h264_idct_add;
2580                     idct_dc_add = s->dsp.h264_idct_dc_add;
2581                     for(i=16; i<16+8; i++){
2582                         if(h->non_zero_count_cache[ scan8[i] ])
2583                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2584                         else if(h->mb[i*16])
2585                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2586                     }
2587                 }else{
2588                     for(i=16; i<16+8; i++){
2589                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2590                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2591                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2592                         }
2593                     }
2594                 }
2595             }
2596         }
2597     }
         /* the coefficient buffer is cleared whenever it may have been written */
2598     if(h->cbp || IS_INTRA(mb_type))
2599         s->dsp.clear_blocks(h->mb);
2600
         /* deblocking: save this macroblock's border samples first, then refill
          * the caches and recompute both chroma qps from the stored qscale
          * before running the filter (generic one for MBAFF, fast one otherwise) */
2601     if(h->deblocking_filter) {
2602         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2603         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2604         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2605         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2606         if (!simple && FRAME_MBAFF) {
2607             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2608         } else {
2609             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2610         }
2611     }
2612 }
2613
2614 /**
2615  * Reconstructs one macroblock on the fast path, i.e. hl_decode_mb_internal()
      * with simple=1, which leaves out the checks for expensive uncommon cases.
2616  */
2617 static void hl_decode_mb_simple(H264Context *h){
2618     hl_decode_mb_internal(h, /* simple */ 1);
2619 }
2620
2621 /**
2622  * Reconstructs one macroblock on the general path, i.e. hl_decode_mb_internal()
      * with simple=0, which also covers edge cases such as interlacing.
2623  */
2624 static void av_noinline hl_decode_mb_complex(H264Context *h){
2625     hl_decode_mb_internal(h, /* simple */ 0);
2626 }
2627
/**
 * Reconstructs the current macroblock, choosing between the simple and the
 * complex variant. The complex one is used when ENABLE_SMALL is set, when
 * h->is_complex is set, for intra PCM macroblocks and when s->qscale is 0.
 * Does nothing when the encoder is compiled in and s->decode is unset.
 */
2628 static void hl_decode_mb(H264Context *h){
2629     MpegEncContext * const s = &h->s;
2630     const int mb_type= s->current_picture.mb_type[h->mb_xy];
2631     const int needs_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2632
2633     if(ENABLE_H264_ENCODER && !s->decode)
2634         return;
2635
2636     if(needs_complex)
2637         hl_decode_mb_complex(h);
2638     else
2639         hl_decode_mb_simple(h);
2640 }
2641
/**
 * Turns a frame Picture into a view of one of its fields, in place: every
 * plane's line size is doubled, the bottom field additionally starts one
 * line further down, and reference and poc are set for the chosen field.
 * @param parity field to select; compared against PICT_BOTTOM_FIELD
 */
2642 static void pic_as_field(Picture *pic, const int parity){
2643     const int bottom = (parity == PICT_BOTTOM_FIELD);
2644     int plane;
2645     pic->reference = parity;
2646     for (plane = 0; plane < 4; plane++) {
2647         if (bottom) pic->data[plane] += pic->linesize[plane];
2648         pic->linesize[plane] *= 2;
2649     }
2650     pic->poc = pic->field_poc[bottom];
2651 }
2652
/**
 * Copies src to dest if src is marked as reference for the given parity.
 * For a field parity (anything but PICT_FRAME) the copy is converted with
 * pic_as_field() and its pic_id becomes 2*pic_id + id_add.
 * @return 1 if the picture was copied, 0 if src does not match parity
 */
2653 static int split_field_copy(Picture *dest, Picture *src,
2654                             int parity, int id_add){
2655     if (!(src->reference & parity))
2656         return 0;
2657
2658     *dest = *src;
2659     if(parity == PICT_FRAME)
2660         return 1;
2661
2662     pic_as_field(dest, parity);
2663     dest->pic_id = dest->pic_id * 2 + id_add;
2664     return 1;
2665 }
2668
/**
 * Appends pictures from in[] to def[], alternating between pictures that are
 * referenced with parity sel and pictures referenced with the opposite parity
 * (sel^3), each taken in the order they appear in in[]. NULL entries are skipped.
 * Before copying, pic_id of the source picture is set to its index in in[]
 * (is_long) or to its frame_num.
 * @return number of entries written to def[]
 */
2669 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2670     int same_i=0, opp_i=0;
2671     int count=0;
2672
2673     while(same_i<len || opp_i<len){
              /* advance each cursor to the next usable picture of its parity */
2674         while(same_i<len && (!in[same_i] || !(in[same_i]->reference & sel)))
2675             same_i++;
2676         while(opp_i<len && (!in[opp_i] || !(in[opp_i]->reference & (sel^3))))
2677             opp_i++;
2678         if(same_i < len){
2679             in[same_i]->pic_id= is_long ? same_i : in[same_i]->frame_num;
2680             split_field_copy(&def[count++], in[same_i++], sel  , 1);
2681         }
2682         if(opp_i < len){
2683             in[opp_i]->pic_id= is_long ? opp_i : in[opp_i]->frame_num;
2684             split_field_copy(&def[count++], in[opp_i++], sel^3, 0);
2685         }
2686     }
2687
2688     return count;
2689 }
2690
/**
 * Selection-sorts the pictures of src[] lying on one side of limit into sorted[].
 * With dir=0 these are the pictures with poc > limit, in ascending poc order;
 * with dir=1 the pictures with poc <= limit, in descending poc order.
 * @return number of pictures written to sorted[]
 */
2691 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2692     const int sentinel= dir ? INT_MIN : INT_MAX;
2693     int n_sorted= 0;
2694     int k, best;
2695
2696     while(1){
2697         best= sentinel;
2698         for(k=0; k<len; k++){
2699             Picture * const cand= src[k];
2700             if(((cand->poc > limit) ^ dir) && ((cand->poc < best) ^ dir)){
2701                 best= cand->poc;
2702                 sorted[n_sorted]= cand;
2703             }
2704         }
2705         if(best == sentinel) // nothing left on this side of limit
2706             break;
2707         limit= sorted[n_sorted++]->poc - dir;
2708     }
2709     return n_sorted;
2710 }
2711
2712 /**
2713  * fills the default_ref_list.
2714  */
2715 static int fill_default_ref_list(H264Context *h){
2716     MpegEncContext * const s = &h->s;
2717     int i, len;
2718
2719     if(h->slice_type_nos==FF_B_TYPE){
2720         Picture *sorted[32];
2721         int cur_poc, list;
2722         int lens[2];
2723
2724         if(FIELD_PICTURE)
2725             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2726         else
2727             cur_poc= s->current_picture_ptr->poc;
2728
2729         for(list= 0; list<2; list++){
2730             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2731             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2732             assert(len<=32);
2733             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2734             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2735             assert(len<=32);
2736
2737             if(len < h->ref_count[list])
2738                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2739             lens[list]= len;
2740         }
2741
2742         if(lens[0] == lens[1] && lens[1] > 1){
2743             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2744             if(i == lens[0])
2745                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2746         }
2747     }else{
2748         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2749         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2750         assert(len <= 32);
2751         if(len < h->ref_count[0])
2752             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2753     }
2754 #ifdef TRACE
2755     for (i=0; i<h->ref_count[0]; i++) {
2756         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2757     }
2758     if(h->slice_type_nos==FF_B_TYPE){
2759         for (i=0; i<h->ref_count[1]; i++) {
2760             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2761         }
2762     }
2763 #endif
2764     return 0;
2765 }
2766
2767 static void print_short_term(H264Context *h);
2768 static void print_long_term(H264Context *h);
2769
2770 /**
2771  * Extract structure information about the picture described by pic_num in
2772  * the current decoding context (frame or field). Note that pic_num is
2773  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2774  * @param pic_num picture number for which to extract structure information
2775  * @param structure one of PICT_XXX describing structure of picture
2776  *                      with pic_num
2777  * @return frame number (short term) or long term index of picture
2778  *         described by pic_num
2779  */
2780 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2781     MpegEncContext * const s = &h->s;
2782
2783     *structure = s->picture_structure;
2784     if(FIELD_PICTURE){
2785         if (!(pic_num & 1))
2786             /* opposite field */
2787             *structure ^= PICT_FRAME;
2788         pic_num >>= 1;
2789     }
2790
2791     return pic_num;
2792 }
2793
2794 static int decode_ref_pic_list_reordering(H264Context *h){
2795     MpegEncContext * const s = &h->s;
2796     int list, index, pic_structure;
2797
2798     print_short_term(h);
2799     print_long_term(h);
2800
2801     for(list=0; list<h->list_count; list++){
2802         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2803
2804         if(get_bits1(&s->gb)){
2805             int pred= h->curr_pic_num;
2806
2807             for(index=0; ; index++){
2808                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2809                 unsigned int pic_id;
2810                 int i;
2811                 Picture *ref = NULL;
2812
2813                 if(reordering_of_pic_nums_idc==3)
2814                     break;
2815
2816                 if(index >= h->ref_count[list]){
2817                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2818                     return -1;
2819                 }
2820
2821                 if(reordering_of_pic_nums_idc<3){
2822                     if(reordering_of_pic_nums_idc<2){
2823                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2824                         int frame_num;
2825
2826                         if(abs_diff_pic_num > h->max_pic_num){
2827                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2828                             return -1;
2829                         }
2830
2831                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2832                         else                                pred+= abs_diff_pic_num;
2833                         pred &= h->max_pic_num - 1;
2834
2835                         frame_num = pic_num_extract(h, pred, &pic_structure);
2836
2837                         for(i= h->short_ref_count-1; i>=0; i--){
2838                             ref = h->short_ref[i];
2839                             assert(ref->reference);
2840                             assert(!ref->long_ref);
2841                             if(
2842                                    ref->frame_num == frame_num &&
2843                                    (ref->reference & pic_structure)
2844                               )
2845                                 break;
2846                         }
2847                         if(i>=0)
2848                             ref->pic_id= pred;
2849                     }else{
2850                         int long_idx;
2851                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2852
2853                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2854
2855                         if(long_idx>31){
2856                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2857                             return -1;
2858                         }
2859                         ref = h->long_ref[long_idx];
2860                         assert(!(ref && !ref->reference));
2861                         if(ref && (ref->reference & pic_structure)){
2862                             ref->pic_id= pic_id;
2863                             assert(ref->long_ref);
2864                             i=0;
2865                         }else{
2866                             i=-1;
2867                         }
2868                     }
2869
2870                     if (i < 0) {
2871                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2872                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2873                     } else {
2874                         for(i=index; i+1<h->ref_count[list]; i++){
2875                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2876                                 break;
2877                         }
2878                         for(; i > index; i--){
2879                             h->ref_list[list][i]= h->ref_list[list][i-1];
2880                         }
2881                         h->ref_list[list][index]= *ref;
2882                         if (FIELD_PICTURE){
2883                             pic_as_field(&h->ref_list[list][index], pic_structure);
2884                         }
2885                     }
2886                 }else{
2887                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2888                     return -1;
2889                 }
2890             }
2891         }
2892     }
2893     for(list=0; list<h->list_count; list++){
2894         for(index= 0; index < h->ref_count[list]; index++){
2895             if(!h->ref_list[list][index].data[0]){
2896                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2897                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2898             }
2899         }
2900     }
2901
2902     return 0;
2903 }
2904
2905 static void fill_mbaff_ref_list(H264Context *h){
2906     int list, i, j;
2907     for(list=0; list<2; list++){ //FIXME try list_count
2908         for(i=0; i<h->ref_count[list]; i++){
2909             Picture *frame = &h->ref_list[list][i];
2910             Picture *field = &h->ref_list[list][16+2*i];
2911             field[0] = *frame;
2912             for(j=0; j<3; j++)
2913                 field[0].linesize[j] <<= 1;
2914             field[0].reference = PICT_TOP_FIELD;
2915             field[0].poc= field[0].field_poc[0];
2916             field[1] = field[0];
2917             for(j=0; j<3; j++)
2918                 field[1].data[j] += frame->linesize[j];
2919             field[1].reference = PICT_BOTTOM_FIELD;
2920             field[1].poc= field[1].field_poc[1];
2921
2922             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2923             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2924             for(j=0; j<2; j++){
2925                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2926                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2927             }
2928         }
2929     }
2930     for(j=0; j<h->ref_count[1]; j++){
2931         for(i=0; i<h->ref_count[0]; i++)
2932             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2933         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2934         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2935     }
2936 }
2937
2938 static int pred_weight_table(H264Context *h){
2939     MpegEncContext * const s = &h->s;
2940     int list, i;
2941     int luma_def, chroma_def;
2942
2943     h->use_weight= 0;
2944     h->use_weight_chroma= 0;
2945     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2946     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2947     luma_def = 1<<h->luma_log2_weight_denom;
2948     chroma_def = 1<<h->chroma_log2_weight_denom;
2949
2950     for(list=0; list<2; list++){
2951         for(i=0; i<h->ref_count[list]; i++){
2952             int luma_weight_flag, chroma_weight_flag;
2953
2954             luma_weight_flag= get_bits1(&s->gb);
2955             if(luma_weight_flag){
2956                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2957                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2958                 if(   h->luma_weight[list][i] != luma_def
2959                    || h->luma_offset[list][i] != 0)
2960                     h->use_weight= 1;
2961             }else{
2962                 h->luma_weight[list][i]= luma_def;
2963                 h->luma_offset[list][i]= 0;
2964             }
2965
2966             if(CHROMA){
2967                 chroma_weight_flag= get_bits1(&s->gb);
2968                 if(chroma_weight_flag){
2969                     int j;
2970                     for(j=0; j<2; j++){
2971                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2972                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2973                         if(   h->chroma_weight[list][i][j] != chroma_def
2974                         || h->chroma_offset[list][i][j] != 0)
2975                             h->use_weight_chroma= 1;
2976                     }
2977                 }else{
2978                     int j;
2979                     for(j=0; j<2; j++){
2980                         h->chroma_weight[list][i][j]= chroma_def;
2981                         h->chroma_offset[list][i][j]= 0;
2982                     }
2983                 }
2984             }
2985         }
2986         if(h->slice_type_nos != FF_B_TYPE) break;
2987     }
2988     h->use_weight= h->use_weight || h->use_weight_chroma;
2989     return 0;
2990 }
2991
2992 static void implicit_weight_table(H264Context *h){
2993     MpegEncContext * const s = &h->s;
2994     int ref0, ref1;
2995     int cur_poc = s->current_picture_ptr->poc;
2996
2997     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2998        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2999         h->use_weight= 0;
3000         h->use_weight_chroma= 0;
3001         return;
3002     }
3003
3004     h->use_weight= 2;
3005     h->use_weight_chroma= 2;
3006     h->luma_log2_weight_denom= 5;
3007     h->chroma_log2_weight_denom= 5;
3008
3009     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3010         int poc0 = h->ref_list[0][ref0].poc;
3011         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3012             int poc1 = h->ref_list[1][ref1].poc;
3013             int td = av_clip(poc1 - poc0, -128, 127);
3014             if(td){
3015                 int tb = av_clip(cur_poc - poc0, -128, 127);
3016                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3017                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3018                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3019                     h->implicit_weight[ref0][ref1] = 32;
3020                 else
3021                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3022             }else
3023                 h->implicit_weight[ref0][ref1] = 32;
3024         }
3025     }
3026 }
3027
3028 /**
3029  * Mark a picture as no longer needed for reference. The refmask
3030  * argument allows unreferencing of individual fields or the whole frame.
3031  * If the picture becomes entirely unreferenced, but is being held for
3032  * display purposes, it is marked as such.
3033  * @param refmask mask of fields to unreference; the mask is bitwise
3034  *                anded with the reference marking of pic
3035  * @return non-zero if pic becomes entirely unreferenced (except possibly
3036  *         for display purposes) zero if one of the fields remains in
3037  *         reference
3038  */
3039 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3040     int i;
3041     if (pic->reference &= refmask) {
3042         return 0;
3043     } else {
3044         for(i = 0; h->delayed_pic[i]; i++)
3045             if(pic == h->delayed_pic[i]){
3046                 pic->reference=DELAYED_PIC_REF;
3047                 break;
3048             }
3049         return 1;
3050     }
3051 }
3052
3053 /**
3054  * instantaneous decoder refresh.
3055  */
3056 static void idr(H264Context *h){
3057     int i;
3058
3059     for(i=0; i<16; i++){
3060         remove_long(h, i, 0);
3061     }
3062     assert(h->long_ref_count==0);
3063
3064     for(i=0; i<h->short_ref_count; i++){
3065         unreference_pic(h, h->short_ref[i], 0);
3066         h->short_ref[i]= NULL;
3067     }
3068     h->short_ref_count=0;
3069     h->prev_frame_num= 0;
3070     h->prev_frame_num_offset= 0;
3071     h->prev_poc_msb=
3072     h->prev_poc_lsb= 0;
3073 }
3074
3075 /* forget old pics after a seek */
3076 static void flush_dpb(AVCodecContext *avctx){
3077     H264Context *h= avctx->priv_data;
3078     int i;
3079     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3080         if(h->delayed_pic[i])
3081             h->delayed_pic[i]->reference= 0;
3082         h->delayed_pic[i]= NULL;
3083     }
3084     h->outputed_poc= INT_MIN;
3085     idr(h);
3086     if(h->s.current_picture_ptr)
3087         h->s.current_picture_ptr->reference= 0;
3088     h->s.first_field= 0;
3089     ff_mpeg_flush(avctx);
3090 }
3091
3092 /**
3093  * Find a Picture in the short term reference list by frame number.
3094  * @param frame_num frame number to search for
3095  * @param idx the index into h->short_ref where returned picture is found
3096  *            undefined if no picture found.
3097  * @return pointer to the found picture, or NULL if no pic with the provided
3098  *                 frame number is found
3099  */
3100 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3101     MpegEncContext * const s = &h->s;
3102     int i;
3103
3104     for(i=0; i<h->short_ref_count; i++){
3105         Picture *pic= h->short_ref[i];
3106         if(s->avctx->debug&FF_DEBUG_MMCO)
3107             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3108         if(pic->frame_num == frame_num) {
3109             *idx = i;
3110             return pic;
3111         }
3112     }
3113     return NULL;
3114 }
3115
3116 /**
3117  * Remove a picture from the short term reference list by its index in
3118  * that list.  This does no checking on the provided index; it is assumed
3119  * to be valid. Other list entries are shifted down.
3120  * @param i index into h->short_ref of picture to remove.
3121  */
3122 static void remove_short_at_index(H264Context *h, int i){
3123     assert(i >= 0 && i < h->short_ref_count);
3124     h->short_ref[i]= NULL;
3125     if (--h->short_ref_count)
3126         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3127 }
3128
3129 /**
3130  *
3131  * @return the removed picture or NULL if an error occurs
3132  */
3133 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3134     MpegEncContext * const s = &h->s;
3135     Picture *pic;
3136     int i;
3137
3138     if(s->avctx->debug&FF_DEBUG_MMCO)
3139         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3140
3141     pic = find_short(h, frame_num, &i);
3142     if (pic){
3143         if(unreference_pic(h, pic, ref_mask))
3144         remove_short_at_index(h, i);
3145     }
3146
3147     return pic;
3148 }
3149
3150 /**
3151  * Remove a picture from the long term reference list by its index in
3152  * that list.
3153  * @return the removed picture or NULL if an error occurs
3154  */
3155 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3156     Picture *pic;
3157
3158     pic= h->long_ref[i];
3159     if (pic){
3160         if(unreference_pic(h, pic, ref_mask)){
3161             assert(h->long_ref[i]->long_ref == 1);
3162             h->long_ref[i]->long_ref= 0;
3163             h->long_ref[i]= NULL;
3164             h->long_ref_count--;
3165         }
3166     }
3167
3168     return pic;
3169 }
3170
3171 /**
3172  * print short term list
3173  */
3174 static void print_short_term(H264Context *h) {
3175     uint32_t i;
3176     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3177         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3178         for(i=0; i<h->short_ref_count; i++){
3179             Picture *pic= h->short_ref[i];
3180             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3181         }
3182     }
3183 }
3184
3185 /**
3186  * print long term list
3187  */
3188 static void print_long_term(H264Context *h) {
3189     uint32_t i;
3190     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3191         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3192         for(i = 0; i < 16; i++){
3193             Picture *pic= h->long_ref[i];
3194             if (pic) {
3195                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3196             }
3197         }
3198     }
3199 }
3200
3201 /**
3202  * Executes the reference picture marking (memory management control operations).
3203  */
3204 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3205     MpegEncContext * const s = &h->s;
3206     int i, j;
3207     int current_ref_assigned=0;
3208     Picture *pic;
3209
3210     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3211         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3212
3213     for(i=0; i<mmco_count; i++){
3214         int structure, frame_num;
3215         if(s->avctx->debug&FF_DEBUG_MMCO)
3216             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3217
3218         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3219            || mmco[i].opcode == MMCO_SHORT2LONG){
3220             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3221             pic = find_short(h, frame_num, &j);
3222             if(!pic){
3223                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3224                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3225                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3226                 continue;
3227             }
3228         }
3229
3230         switch(mmco[i].opcode){
3231         case MMCO_SHORT2UNUSED:
3232             if(s->avctx->debug&FF_DEBUG_MMCO)
3233                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3234             remove_short(h, frame_num, structure ^ PICT_FRAME);
3235             break;
3236         case MMCO_SHORT2LONG:
3237                 if (h->long_ref[mmco[i].long_arg] != pic)
3238                     remove_long(h, mmco[i].long_arg, 0);
3239
3240                 remove_short_at_index(h, j);
3241                 h->long_ref[ mmco[i].long_arg ]= pic;
3242                 if (h->long_ref[ mmco[i].long_arg ]){
3243                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3244                     h->long_ref_count++;
3245                 }
3246             break;
3247         case MMCO_LONG2UNUSED:
3248             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3249             pic = h->long_ref[j];
3250             if (pic) {
3251                 remove_long(h, j, structure ^ PICT_FRAME);
3252             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3253                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3254             break;
3255         case MMCO_LONG:
3256                     // Comment below left from previous code as it is an interresting note.
3257                     /* First field in pair is in short term list or
3258                      * at a different long term index.
3259                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3260                      * Report the problem and keep the pair where it is,
3261                      * and mark this field valid.
3262                      */
3263
3264             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3265                 remove_long(h, mmco[i].long_arg, 0);
3266
3267                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3268                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3269                 h->long_ref_count++;
3270             }
3271
3272             s->current_picture_ptr->reference |= s->picture_structure;
3273             current_ref_assigned=1;
3274             break;
3275         case MMCO_SET_MAX_LONG:
3276             assert(mmco[i].long_arg <= 16);
3277             // just remove the long term which index is greater than new max
3278             for(j = mmco[i].long_arg; j<16; j++){
3279                 remove_long(h, j, 0);
3280             }
3281             break;
3282         case MMCO_RESET:
3283             while(h->short_ref_count){
3284                 remove_short(h, h->short_ref[0]->frame_num, 0);
3285             }
3286             for(j = 0; j < 16; j++) {
3287                 remove_long(h, j, 0);
3288             }
3289             s->current_picture_ptr->poc=
3290             s->current_picture_ptr->field_poc[0]=
3291             s->current_picture_ptr->field_poc[1]=
3292             h->poc_lsb=
3293             h->poc_msb=
3294             h->frame_num=
3295             s->current_picture_ptr->frame_num= 0;
3296             break;
3297         default: assert(0);
3298         }
3299     }
3300
3301     if (!current_ref_assigned) {
3302         /* Second field of complementary field pair; the first field of
3303          * which is already referenced. If short referenced, it
3304          * should be first entry in short_ref. If not, it must exist
3305          * in long_ref; trying to put it on the short list here is an
3306          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3307          */
3308         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3309             /* Just mark the second field valid */
3310             s->current_picture_ptr->reference = PICT_FRAME;
3311         } else if (s->current_picture_ptr->long_ref) {
3312             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3313                                              "assignment for second field "
3314                                              "in complementary field pair "
3315                                              "(first field is long term)\n");
3316         } else {
3317             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3318             if(pic){
3319                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3320             }
3321
3322             if(h->short_ref_count)
3323                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3324
3325             h->short_ref[0]= s->current_picture_ptr;
3326             h->short_ref_count++;
3327             s->current_picture_ptr->reference |= s->picture_structure;
3328         }
3329     }
3330
3331     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3332
3333         /* We have too many reference frames, probably due to corrupted
3334          * stream. Need to discard one frame. Prevents overrun of the
3335          * short_ref and long_ref buffers.
3336          */
3337         av_log(h->s.avctx, AV_LOG_ERROR,
3338                "number of reference frames exceeds max (probably "
3339                "corrupt input), discarding one\n");
3340
3341         if (h->long_ref_count && !h->short_ref_count) {
3342             for (i = 0; i < 16; ++i)
3343                 if (h->long_ref[i])
3344                     break;
3345
3346             assert(i < 16);
3347             remove_long(h, i, 0);
3348         } else {
3349             pic = h->short_ref[h->short_ref_count - 1];
3350             remove_short(h, pic->frame_num, 0);
3351         }
3352     }
3353
3354     print_short_term(h);
3355     print_long_term(h);
3356     return 0;
3357 }
3358
3359 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3360     MpegEncContext * const s = &h->s;
3361     int i;
3362
3363     h->mmco_index= 0;
3364     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3365         s->broken_link= get_bits1(gb) -1;
3366         if(get_bits1(gb)){
3367             h->mmco[0].opcode= MMCO_LONG;
3368             h->mmco[0].long_arg= 0;
3369             h->mmco_index= 1;
3370         }
3371     }else{
3372         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3373             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3374                 MMCOOpcode opcode= get_ue_golomb(gb);
3375
3376                 h->mmco[i].opcode= opcode;
3377                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3378                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3379 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3380                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3381                         return -1;
3382                     }*/
3383                 }
3384                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3385                     unsigned int long_arg= get_ue_golomb(gb);
3386                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3387                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3388                         return -1;
3389                     }
3390                     h->mmco[i].long_arg= long_arg;
3391                 }
3392
3393                 if(opcode > (unsigned)MMCO_LONG){
3394                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3395                     return -1;
3396                 }
3397                 if(opcode == MMCO_END)
3398                     break;
3399             }
3400             h->mmco_index= i;
3401         }else{
3402             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3403
3404             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3405                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3406                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3407                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3408                 h->mmco_index= 1;
3409                 if (FIELD_PICTURE) {
3410                     h->mmco[0].short_pic_num *= 2;
3411                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3412                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3413                     h->mmco_index= 2;
3414                 }
3415             }
3416         }
3417     }
3418
3419     return 0;
3420 }
3421
3422 static int init_poc(H264Context *h){
3423     MpegEncContext * const s = &h->s;
3424     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3425     int field_poc[2];
3426     Picture *cur = s->current_picture_ptr;
3427
3428     h->frame_num_offset= h->prev_frame_num_offset;
3429     if(h->frame_num < h->prev_frame_num)
3430         h->frame_num_offset += max_frame_num;
3431
3432     if(h->sps.poc_type==0){
3433         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3434
3435         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3436             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3437         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3438             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3439         else
3440             h->poc_msb = h->prev_poc_msb;
3441 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3442         field_poc[0] =
3443         field_poc[1] = h->poc_msb + h->poc_lsb;
3444         if(s->picture_structure == PICT_FRAME)
3445             field_poc[1] += h->delta_poc_bottom;
3446     }else if(h->sps.poc_type==1){
3447         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3448         int i;
3449
3450         if(h->sps.poc_cycle_length != 0)
3451             abs_frame_num = h->frame_num_offset + h->frame_num;
3452         else
3453             abs_frame_num = 0;
3454
3455         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3456             abs_frame_num--;
3457
3458         expected_delta_per_poc_cycle = 0;
3459         for(i=0; i < h->sps.poc_cycle_length; i++)
3460             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3461
3462         if(abs_frame_num > 0){
3463             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3464             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3465
3466             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3467             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3468                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3469         } else
3470             expectedpoc = 0;
3471
3472         if(h->nal_ref_idc == 0)
3473             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3474
3475         field_poc[0] = expectedpoc + h->delta_poc[0];
3476         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3477
3478         if(s->picture_structure == PICT_FRAME)
3479             field_poc[1] += h->delta_poc[1];
3480     }else{
3481         int poc= 2*(h->frame_num_offset + h->frame_num);
3482
3483         if(!h->nal_ref_idc)
3484             poc--;
3485
3486         field_poc[0]= poc;
3487         field_poc[1]= poc;
3488     }
3489
3490     if(s->picture_structure != PICT_BOTTOM_FIELD)
3491         s->current_picture_ptr->field_poc[0]= field_poc[0];
3492     if(s->picture_structure != PICT_TOP_FIELD)
3493         s->current_picture_ptr->field_poc[1]= field_poc[1];
3494     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3495
3496     return 0;
3497 }
3498
3499
3500 /**
3501  * initialize scan tables
3502  */
3503 static void init_scan_tables(H264Context *h){
3504     MpegEncContext * const s = &h->s;
3505     int i;
3506     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3507         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3508         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3509     }else{
3510         for(i=0; i<16; i++){
3511 #define T(x) (x>>2) | ((x<<2) & 0xF)
3512             h->zigzag_scan[i] = T(zigzag_scan[i]);
3513             h-> field_scan[i] = T( field_scan[i]);
3514 #undef T
3515         }
3516     }
3517     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3518         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3519         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3520         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3521         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3522     }else{
3523         for(i=0; i<64; i++){
3524 #define T(x) (x>>3) | ((x&7)<<3)
3525             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3526             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3527             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3528             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3529 #undef T
3530         }
3531     }
3532     if(h->sps.transform_bypass){ //FIXME same ugly
3533         h->zigzag_scan_q0          = zigzag_scan;
3534         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3535         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3536         h->field_scan_q0           = field_scan;
3537         h->field_scan8x8_q0        = field_scan8x8;
3538         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3539     }else{
3540         h->zigzag_scan_q0          = h->zigzag_scan;
3541         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3542         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3543         h->field_scan_q0           = h->field_scan;
3544         h->field_scan8x8_q0        = h->field_scan8x8;
3545         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3546     }
3547 }
3548
3549 /**
3550  * Replicates H264 "master" context to thread contexts.
3551  */
3552 static void clone_slice(H264Context *dst, H264Context *src)
3553 {
3554     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3555     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3556     dst->s.current_picture      = src->s.current_picture;
3557     dst->s.linesize             = src->s.linesize;
3558     dst->s.uvlinesize           = src->s.uvlinesize;
3559     dst->s.first_field          = src->s.first_field;
3560
3561     dst->prev_poc_msb           = src->prev_poc_msb;
3562     dst->prev_poc_lsb           = src->prev_poc_lsb;
3563     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3564     dst->prev_frame_num         = src->prev_frame_num;
3565     dst->short_ref_count        = src->short_ref_count;
3566
3567     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3568     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3569     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3570     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3571
3572     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3573     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3574 }
3575
3576 /**
3577  * decodes a slice header.
3578  * This will also call MPV_common_init() and frame_start() as needed.
3579  *
3580  * @param h h264context
3581  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3582  *
3583  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3584  */
3585 static int decode_slice_header(H264Context *h, H264Context *h0){
3586     MpegEncContext * const s = &h->s;
3587     MpegEncContext * const s0 = &h0->s;
3588     unsigned int first_mb_in_slice;
3589     unsigned int pps_id;
3590     int num_ref_idx_active_override_flag;
3591     unsigned int slice_type, tmp, i, j;
3592     int default_ref_list_done = 0;
3593     int last_pic_structure;
3594
3595     s->dropable= h->nal_ref_idc == 0;
3596
3597     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3598         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3599         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3600     }else{
3601         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3602         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3603     }
3604
3605     first_mb_in_slice= get_ue_golomb(&s->gb);
3606
3607     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3608         h0->current_slice = 0;
3609         if (!s0->first_field)
3610             s->current_picture_ptr= NULL;
3611     }
3612
3613     slice_type= get_ue_golomb(&s->gb);
3614     if(slice_type > 9){
3615         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3616         return -1;
3617     }
3618     if(slice_type > 4){
3619         slice_type -= 5;
3620         h->slice_type_fixed=1;
3621     }else
3622         h->slice_type_fixed=0;
3623
3624     slice_type= golomb_to_pict_type[ slice_type ];
3625     if (slice_type == FF_I_TYPE
3626         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3627         default_ref_list_done = 1;
3628     }
3629     h->slice_type= slice_type;
3630     h->slice_type_nos= slice_type & 3;
3631
3632     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3633     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3634         av_log(h->s.avctx, AV_LOG_ERROR,
3635                "B picture before any references, skipping\n");
3636         return -1;
3637     }
3638
3639     pps_id= get_ue_golomb(&s->gb);
3640     if(pps_id>=MAX_PPS_COUNT){
3641         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3642         return -1;
3643     }
3644     if(!h0->pps_buffers[pps_id]) {
3645         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3646         return -1;
3647     }
3648     h->pps= *h0->pps_buffers[pps_id];
3649
3650     if(!h0->sps_buffers[h->pps.sps_id]) {
3651         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3652         return -1;
3653     }
3654     h->sps = *h0->sps_buffers[h->pps.sps_id];
3655
3656     if(h == h0 && h->dequant_coeff_pps != pps_id){
3657         h->dequant_coeff_pps = pps_id;
3658         init_dequant_tables(h);
3659     }
3660
3661     s->mb_width= h->sps.mb_width;
3662     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3663
3664     h->b_stride=  s->mb_width*4;
3665     h->b8_stride= s->mb_width*2;
3666
3667     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3668     if(h->sps.frame_mbs_only_flag)
3669         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3670     else
3671         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3672
3673     if (s->context_initialized
3674         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3675         if(h != h0)
3676             return -1;   // width / height changed during parallelized decoding
3677         free_tables(h);
3678         flush_dpb(s->avctx);
3679         MPV_common_end(s);
3680     }
3681     if (!s->context_initialized) {
3682         if(h != h0)
3683             return -1;  // we cant (re-)initialize context during parallel decoding
3684         if (MPV_common_init(s) < 0)
3685             return -1;
3686         s->first_field = 0;
3687
3688         init_scan_tables(h);
3689         alloc_tables(h);
3690
3691         for(i = 1; i < s->avctx->thread_count; i++) {
3692             H264Context *c;
3693             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3694             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3695             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3696             c->sps = h->sps;
3697             c->pps = h->pps;
3698             init_scan_tables(c);
3699             clone_tables(c, h);
3700         }
3701
3702         for(i = 0; i < s->avctx->thread_count; i++)
3703             if(context_init(h->thread_context[i]) < 0)
3704                 return -1;
3705
3706         s->avctx->width = s->width;
3707         s->avctx->height = s->height;
3708         s->avctx->sample_aspect_ratio= h->sps.sar;
3709         if(!s->avctx->sample_aspect_ratio.den)
3710             s->avctx->sample_aspect_ratio.den = 1;
3711
3712         if(h->sps.timing_info_present_flag){
3713             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3714             if(h->x264_build > 0 && h->x264_build < 44)
3715                 s->avctx->time_base.den *= 2;
3716             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3717                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3718         }
3719     }
3720
3721     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3722
3723     h->mb_mbaff = 0;
3724     h->mb_aff_frame = 0;
3725     last_pic_structure = s0->picture_structure;
3726     if(h->sps.frame_mbs_only_flag){
3727         s->picture_structure= PICT_FRAME;
3728     }else{
3729         if(get_bits1(&s->gb)) { //field_pic_flag
3730             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3731         } else {
3732             s->picture_structure= PICT_FRAME;
3733             h->mb_aff_frame = h->sps.mb_aff;
3734         }
3735     }
3736     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3737
3738     if(h0->current_slice == 0){
3739         while(h->frame_num !=  h->prev_frame_num &&
3740               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3741             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3742             frame_start(h);
3743             h->prev_frame_num++;
3744             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3745             s->current_picture_ptr->frame_num= h->prev_frame_num;
3746             execute_ref_pic_marking(h, NULL, 0);
3747         }
3748
3749         /* See if we have a decoded first field looking for a pair... */
3750         if (s0->first_field) {
3751             assert(s0->current_picture_ptr);
3752             assert(s0->current_picture_ptr->data[0]);
3753             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3754
3755             /* figure out if we have a complementary field pair */
3756             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3757                 /*
3758                  * Previous field is unmatched. Don't display it, but let it
3759                  * remain for reference if marked as such.
3760                  */
3761                 s0->current_picture_ptr = NULL;
3762                 s0->first_field = FIELD_PICTURE;
3763
3764             } else {
3765                 if (h->nal_ref_idc &&
3766                         s0->current_picture_ptr->reference &&
3767                         s0->current_picture_ptr->frame_num != h->frame_num) {
3768                     /*
3769                      * This and previous field were reference, but had
3770                      * different frame_nums. Consider this field first in
3771                      * pair. Throw away previous field except for reference
3772                      * purposes.
3773                      */
3774                     s0->first_field = 1;
3775                     s0->current_picture_ptr = NULL;
3776
3777                 } else {
3778                     /* Second field in complementary pair */
3779                     s0->first_field = 0;
3780                 }
3781             }
3782
3783         } else {
3784             /* Frame or first field in a potentially complementary pair */
3785             assert(!s0->current_picture_ptr);
3786             s0->first_field = FIELD_PICTURE;
3787         }
3788
3789         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3790             s0->first_field = 0;
3791             return -1;
3792         }
3793     }
3794     if(h != h0)
3795         clone_slice(h, h0);
3796
3797     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3798
3799     assert(s->mb_num == s->mb_width * s->mb_height);
3800     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3801        first_mb_in_slice                    >= s->mb_num){
3802         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3803         return -1;
3804     }
3805     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3806     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3807     if (s->picture_structure == PICT_BOTTOM_FIELD)
3808         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3809     assert(s->mb_y < s->mb_height);
3810
3811     if(s->picture_structure==PICT_FRAME){
3812         h->curr_pic_num=   h->frame_num;
3813         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3814     }else{
3815         h->curr_pic_num= 2*h->frame_num + 1;
3816         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3817     }
3818
3819     if(h->nal_unit_type == NAL_IDR_SLICE){
3820         get_ue_golomb(&s->gb); /* idr_pic_id */
3821     }
3822
3823     if(h->sps.poc_type==0){
3824         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3825
3826         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3827             h->delta_poc_bottom= get_se_golomb(&s->gb);
3828         }
3829     }
3830
3831     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3832         h->delta_poc[0]= get_se_golomb(&s->gb);
3833
3834         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3835             h->delta_poc[1]= get_se_golomb(&s->gb);
3836     }
3837
3838     init_poc(h);
3839
3840     if(h->pps.redundant_pic_cnt_present){
3841         h->redundant_pic_count= get_ue_golomb(&s->gb);
3842     }
3843
3844     //set defaults, might be overridden a few lines later
3845     h->ref_count[0]= h->pps.ref_count[0];
3846     h->ref_count[1]= h->pps.ref_count[1];
3847
3848     if(h->slice_type_nos != FF_I_TYPE){
3849         if(h->slice_type_nos == FF_B_TYPE){
3850             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3851         }
3852         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3853
3854         if(num_ref_idx_active_override_flag){
3855             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3856             if(h->slice_type_nos==FF_B_TYPE)
3857                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3858
3859             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3860                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3861                 h->ref_count[0]= h->ref_count[1]= 1;
3862                 return -1;
3863             }
3864         }
3865         if(h->slice_type_nos == FF_B_TYPE)
3866             h->list_count= 2;
3867         else
3868             h->list_count= 1;
3869     }else
3870         h->list_count= 0;
3871
3872     if(!default_ref_list_done){
3873         fill_default_ref_list(h);
3874     }
3875
3876     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3877         return -1;
3878
3879     if(h->slice_type_nos!=FF_I_TYPE){
3880         s->last_picture_ptr= &h->ref_list[0][0];
3881         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3882     }
3883     if(h->slice_type_nos==FF_B_TYPE){
3884         s->next_picture_ptr= &h->ref_list[1][0];
3885         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3886     }
3887
3888     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3889        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3890         pred_weight_table(h);
3891     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3892         implicit_weight_table(h);
3893     else
3894         h->use_weight = 0;
3895
3896     if(h->nal_ref_idc)
3897         decode_ref_pic_marking(h0, &s->gb);
3898
3899     if(FRAME_MBAFF)
3900         fill_mbaff_ref_list(h);
3901
3902     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3903         direct_dist_scale_factor(h);
3904     direct_ref_list_init(h);
3905
3906     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3907         tmp = get_ue_golomb(&s->gb);
3908         if(tmp > 2){
3909             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3910             return -1;
3911         }
3912         h->cabac_init_idc= tmp;
3913     }
3914
3915     h->last_qscale_diff = 0;
3916     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3917     if(tmp>51){
3918         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3919         return -1;
3920     }
3921     s->qscale= tmp;
3922     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3923     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3924     //FIXME qscale / qp ... stuff
3925     if(h->slice_type == FF_SP_TYPE){
3926         get_bits1(&s->gb); /* sp_for_switch_flag */
3927     }
3928     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3929         get_se_golomb(&s->gb); /* slice_qs_delta */
3930     }
3931
3932     h->deblocking_filter = 1;
3933     h->slice_alpha_c0_offset = 0;
3934     h->slice_beta_offset = 0;
3935     if( h->pps.deblocking_filter_parameters_present ) {
3936         tmp= get_ue_golomb(&s->gb);
3937         if(tmp > 2){
3938             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3939             return -1;
3940         }
3941         h->deblocking_filter= tmp;
3942         if(h->deblocking_filter < 2)
3943             h->deblocking_filter^= 1; // 1<->0
3944
3945         if( h->deblocking_filter ) {
3946             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3947             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3948         }
3949     }
3950
3951     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3952        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3953        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3954        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3955         h->deblocking_filter= 0;
3956
3957     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3958         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3959             /* Cheat slightly for speed:
3960                Do not bother to deblock across slices. */
3961             h->deblocking_filter = 2;
3962         } else {
3963             h0->max_contexts = 1;
3964             if(!h0->single_decode_warning) {
3965                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3966                 h0->single_decode_warning = 1;
3967             }
3968             if(h != h0)
3969                 return 1; // deblocking switched inside frame
3970         }
3971     }
3972
3973 #if 0 //FMO
3974     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3975         slice_group_change_cycle= get_bits(&s->gb, ?);
3976 #endif
3977
3978     h0->last_slice_type = slice_type;
3979     h->slice_num = ++h0->current_slice;
3980     if(h->slice_num >= MAX_SLICES){
3981         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3982     }
3983
3984     for(j=0; j<2; j++){
3985         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3986         ref2frm[0]=
3987         ref2frm[1]= -1;
3988         for(i=0; i<16; i++)
3989             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3990                           +(h->ref_list[j][i].reference&3);
3991         ref2frm[18+0]=
3992         ref2frm[18+1]= -1;
3993         for(i=16; i<48; i++)
3994             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3995                           +(h->ref_list[j][i].reference&3);
3996     }
3997
3998     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3999     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4000
4001     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4002         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4003                h->slice_num,
4004                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4005                first_mb_in_slice,
4006                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4007                pps_id, h->frame_num,
4008                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4009                h->ref_count[0], h->ref_count[1],
4010                s->qscale,
4011                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4012                h->use_weight,
4013                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4014                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4015                );
4016     }
4017
4018     return 0;
4019 }
4020
4021 /**
4022  *
4023  */
4024 static inline int get_level_prefix(GetBitContext *gb){
4025     unsigned int buf;
4026     int log;
4027
4028     OPEN_READER(re, gb);
4029     UPDATE_CACHE(re, gb);
4030     buf=GET_CACHE(re, gb);
4031
4032     log= 32 - av_log2(buf);
4033 #ifdef TRACE
4034     print_bin(buf>>(32-log), log);
4035     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4036 #endif
4037
4038     LAST_SKIP_BITS(re, gb, log);
4039     CLOSE_READER(re, gb);
4040
4041     return log-1;
4042 }
4043
4044 static inline int get_dct8x8_allowed(H264Context *h){
4045     if(h->sps.direct_8x8_inference_flag)
4046         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4047     else
4048         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4049 }
4050
4051 /**
4052  * decodes a residual block.
4053  * @param n block index
4054  * @param scantable scantable
4055  * @param max_coeff number of coefficients in the block
4056  * @return <0 if an error occurred
4057  */
4058 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4059     MpegEncContext * const s = &h->s;
4060     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4061     int level[16];
4062     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4063
4064     //FIXME put trailing_onex into the context
4065
4066     if(n == CHROMA_DC_BLOCK_INDEX){
4067         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4068         total_coeff= coeff_token>>2;
4069     }else{
4070         if(n == LUMA_DC_BLOCK_INDEX){
4071             total_coeff= pred_non_zero_count(h, 0);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074         }else{
4075             total_coeff= pred_non_zero_count(h, n);
4076             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4077             total_coeff= coeff_token>>2;
4078             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4079         }
4080     }
4081
4082     //FIXME set last_non_zero?
4083
4084     if(total_coeff==0)
4085         return 0;
4086     if(total_coeff > (unsigned)max_coeff) {
4087         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4088         return -1;
4089     }
4090
4091     trailing_ones= coeff_token&3;
4092     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4093     assert(total_coeff<=16);
4094
4095     i = show_bits(gb, 3);
4096     skip_bits(gb, trailing_ones);
4097     level[0] = 1-((i&4)>>1);
4098     level[1] = 1-((i&2)   );
4099     level[2] = 1-((i&1)<<1);
4100
4101     if(trailing_ones<total_coeff) {
4102         int level_code, mask;
4103         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4104         int prefix= get_level_prefix(gb);
4105
4106         //first coefficient has suffix_length equal to 0 or 1
4107         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4108             if(suffix_length)
4109                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4110             else
4111                 level_code= (prefix<<suffix_length); //part
4112         }else if(prefix==14){
4113             if(suffix_length)
4114                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4115             else
4116                 level_code= prefix + get_bits(gb, 4); //part
4117         }else{
4118             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4119             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4120             if(prefix>=16)
4121                 level_code += (1<<(prefix-3))-4096;
4122         }
4123
4124         if(trailing_ones < 3) level_code += 2;
4125
4126         suffix_length = 1;
4127         if(level_code > 5)
4128             suffix_length++;
4129         mask= -(level_code&1);
4130         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4131
4132         //remaining coefficients have suffix_length > 0
4133         for(i=trailing_ones+1;i<total_coeff;i++) {
4134             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4135             prefix = get_level_prefix(gb);
4136             if(prefix<15){
4137                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4138             }else{
4139                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4140                 if(prefix>=16)
4141                     level_code += (1<<(prefix-3))-4096;
4142             }
4143             mask= -(level_code&1);
4144             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4145             if(level_code > suffix_limit[suffix_length])
4146                 suffix_length++;
4147         }
4148     }
4149
4150     if(total_coeff == max_coeff)
4151         zeros_left=0;
4152     else{
4153         if(n == CHROMA_DC_BLOCK_INDEX)
4154             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4155         else
4156             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4157     }
4158
4159     coeff_num = zeros_left + total_coeff - 1;
4160     j = scantable[coeff_num];
4161     if(n > 24){
4162         block[j] = level[0];
4163         for(i=1;i<total_coeff;i++) {
4164             if(zeros_left <= 0)
4165                 run_before = 0;
4166             else if(zeros_left < 7){
4167                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4168             }else{
4169                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4170             }
4171             zeros_left -= run_before;
4172             coeff_num -= 1 + run_before;
4173             j= scantable[ coeff_num ];
4174
4175             block[j]= level[i];
4176         }
4177     }else{
4178         block[j] = (level[0] * qmul[j] + 32)>>6;
4179         for(i=1;i<total_coeff;i++) {
4180             if(zeros_left <= 0)
4181                 run_before = 0;
4182             else if(zeros_left < 7){
4183                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4184             }else{
4185                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4186             }
4187             zeros_left -= run_before;
4188             coeff_num -= 1 + run_before;
4189             j= scantable[ coeff_num ];
4190
4191             block[j]= (level[i] * qmul[j] + 32)>>6;
4192         }
4193     }
4194
4195     if(zeros_left<0){
4196         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4197         return -1;
4198     }
4199
4200     return 0;
4201 }
4202
4203 static void predict_field_decoding_flag(H264Context *h){
4204     MpegEncContext * const s = &h->s;
4205     const int mb_xy= h->mb_xy;
4206     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4207                 ? s->current_picture.mb_type[mb_xy-1]
4208                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4209                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4210                 : 0;
4211     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4212 }
4213
4214 /**
4215  * decodes a P_SKIP or B_SKIP macroblock
4216  */
4217 static void decode_mb_skip(H264Context *h){
4218     MpegEncContext * const s = &h->s;
4219     const int mb_xy= h->mb_xy;
4220     int mb_type=0;
4221
4222     memset(h->non_zero_count[mb_xy], 0, 16);
4223     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4224
4225     if(MB_FIELD)
4226         mb_type|= MB_TYPE_INTERLACED;
4227
4228     if( h->slice_type_nos == FF_B_TYPE )
4229     {
4230         // just for fill_caches. pred_direct_motion will set the real mb_type
4231         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4232
4233         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4234         pred_direct_motion(h, &mb_type);
4235         mb_type|= MB_TYPE_SKIP;
4236     }
4237     else
4238     {
4239         int mx, my;
4240         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4241
4242         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4243         pred_pskip_motion(h, &mx, &my);
4244         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4245         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4246     }
4247
4248     write_back_motion(h, mb_type);
4249     s->current_picture.mb_type[mb_xy]= mb_type;
4250     s->current_picture.qscale_table[mb_xy]= s->qscale;
4251     h->slice_table[ mb_xy ]= h->slice_num;
4252     h->prev_mb_skipped= 1;
4253 }
4254
4255 /**
4256  * decodes a macroblock
4257  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4258  */
4259 static int decode_mb_cavlc(H264Context *h){
4260     MpegEncContext * const s = &h->s;
4261     int mb_xy;
4262     int partition_count;
4263     unsigned int mb_type, cbp;
4264     int dct8x8_allowed= h->pps.transform_8x8_mode;
4265
4266     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4267
4268     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4269     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4270                 down the code */
4271     if(h->slice_type_nos != FF_I_TYPE){
4272         if(s->mb_skip_run==-1)
4273             s->mb_skip_run= get_ue_golomb(&s->gb);
4274
4275         if (s->mb_skip_run--) {
4276             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4277                 if(s->mb_skip_run==0)
4278                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4279                 else
4280                     predict_field_decoding_flag(h);
4281             }
4282             decode_mb_skip(h);
4283             return 0;
4284         }
4285     }
4286     if(FRAME_MBAFF){
4287         if( (s->mb_y&1) == 0 )
4288             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4289     }
4290
4291     h->prev_mb_skipped= 0;
4292
4293     mb_type= get_ue_golomb(&s->gb);
4294     if(h->slice_type_nos == FF_B_TYPE){
4295         if(mb_type < 23){
4296             partition_count= b_mb_type_info[mb_type].partition_count;
4297             mb_type=         b_mb_type_info[mb_type].type;
4298         }else{
4299             mb_type -= 23;
4300             goto decode_intra_mb;
4301         }
4302     }else if(h->slice_type_nos == FF_P_TYPE){
4303         if(mb_type < 5){
4304             partition_count= p_mb_type_info[mb_type].partition_count;
4305             mb_type=         p_mb_type_info[mb_type].type;
4306         }else{
4307             mb_type -= 5;
4308             goto decode_intra_mb;
4309         }
4310     }else{
4311        assert(h->slice_type_nos == FF_I_TYPE);
4312         if(h->slice_type == FF_SI_TYPE && mb_type)
4313             mb_type--;
4314 decode_intra_mb:
4315         if(mb_type > 25){
4316             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4317             return -1;
4318         }
4319         partition_count=0;
4320         cbp= i_mb_type_info[mb_type].cbp;
4321         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4322         mb_type= i_mb_type_info[mb_type].type;
4323     }
4324
4325     if(MB_FIELD)
4326         mb_type |= MB_TYPE_INTERLACED;
4327
4328     h->slice_table[ mb_xy ]= h->slice_num;
4329
4330     if(IS_INTRA_PCM(mb_type)){
4331         unsigned int x;
4332
4333         // We assume these blocks are very rare so we do not optimize it.
4334         align_get_bits(&s->gb);
4335
4336         // The pixels are stored in the same order as levels in h->mb array.
4337         for(x=0; x < (CHROMA ? 384 : 256); x++){
4338             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4339         }
4340
4341         // In deblocking, the quantizer is 0
4342         s->current_picture.qscale_table[mb_xy]= 0;
4343         // All coeffs are present
4344         memset(h->non_zero_count[mb_xy], 16, 16);
4345
4346         s->current_picture.mb_type[mb_xy]= mb_type;
4347         return 0;
4348     }
4349
4350     if(MB_MBAFF){
4351         h->ref_count[0] <<= 1;
4352         h->ref_count[1] <<= 1;
4353     }
4354
4355     fill_caches(h, mb_type, 0);
4356
4357     //mb_pred
4358     if(IS_INTRA(mb_type)){
4359         int pred_mode;
4360 //            init_top_left_availability(h);
4361         if(IS_INTRA4x4(mb_type)){
4362             int i;
4363             int di = 1;
4364             if(dct8x8_allowed && get_bits1(&s->gb)){
4365                 mb_type |= MB_TYPE_8x8DCT;
4366                 di = 4;
4367             }
4368
4369 //                fill_intra4x4_pred_table(h);
4370             for(i=0; i<16; i+=di){
4371                 int mode= pred_intra_mode(h, i);
4372
4373                 if(!get_bits1(&s->gb)){
4374                     const int rem_mode= get_bits(&s->gb, 3);
4375                     mode = rem_mode + (rem_mode >= mode);
4376                 }
4377
4378                 if(di==4)
4379                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4380                 else
4381                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4382             }
4383             write_back_intra_pred_mode(h);
4384             if( check_intra4x4_pred_mode(h) < 0)
4385                 return -1;
4386         }else{
4387             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4388             if(h->intra16x16_pred_mode < 0)
4389                 return -1;
4390         }
4391         if(CHROMA){
4392             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4393             if(pred_mode < 0)
4394                 return -1;
4395             h->chroma_pred_mode= pred_mode;
4396         }
4397     }else if(partition_count==4){
4398         int i, j, sub_partition_count[4], list, ref[2][4];
4399
4400         if(h->slice_type_nos == FF_B_TYPE){
4401             for(i=0; i<4; i++){
4402                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4403                 if(h->sub_mb_type[i] >=13){
4404                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4405                     return -1;
4406                 }
4407                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4408                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4409             }
4410             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4411                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4412                 pred_direct_motion(h, &mb_type);
4413                 h->ref_cache[0][scan8[4]] =
4414                 h->ref_cache[1][scan8[4]] =
4415                 h->ref_cache[0][scan8[12]] =
4416                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4417             }
4418         }else{
4419             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4420             for(i=0; i<4; i++){
4421                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4422                 if(h->sub_mb_type[i] >=4){
4423                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4424                     return -1;
4425                 }
4426                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4427                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4428             }
4429         }
4430
4431         for(list=0; list<h->list_count; list++){
4432             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4433             for(i=0; i<4; i++){
4434                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4435                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4436                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4437                     if(tmp>=ref_count){
4438                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4439                         return -1;
4440                     }
4441                     ref[list][i]= tmp;
4442                 }else{
4443                  //FIXME
4444                     ref[list][i] = -1;
4445                 }
4446             }
4447         }
4448
4449         if(dct8x8_allowed)
4450             dct8x8_allowed = get_dct8x8_allowed(h);
4451
4452         for(list=0; list<h->list_count; list++){
4453             for(i=0; i<4; i++){
4454                 if(IS_DIRECT(h->sub_mb_type[i])) {
4455                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4456                     continue;
4457                 }
4458                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4459                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4460
4461                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4462                     const int sub_mb_type= h->sub_mb_type[i];
4463                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4464                     for(j=0; j<sub_partition_count[i]; j++){
4465                         int mx, my;
4466                         const int index= 4*i + block_width*j;
4467                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4468                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4469                         mx += get_se_golomb(&s->gb);
4470                         my += get_se_golomb(&s->gb);
4471                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4472
4473                         if(IS_SUB_8X8(sub_mb_type)){
4474                             mv_cache[ 1 ][0]=
4475                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4476                             mv_cache[ 1 ][1]=
4477                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4478                         }else if(IS_SUB_8X4(sub_mb_type)){
4479                             mv_cache[ 1 ][0]= mx;
4480                             mv_cache[ 1 ][1]= my;
4481                         }else if(IS_SUB_4X8(sub_mb_type)){
4482                             mv_cache[ 8 ][0]= mx;
4483                             mv_cache[ 8 ][1]= my;
4484                         }
4485                         mv_cache[ 0 ][0]= mx;
4486                         mv_cache[ 0 ][1]= my;
4487                     }
4488                 }else{
4489                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4490                     p[0] = p[1]=
4491                     p[8] = p[9]= 0;
4492                 }
4493             }
4494         }
4495     }else if(IS_DIRECT(mb_type)){
4496         pred_direct_motion(h, &mb_type);
4497         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4498     }else{
4499         int list, mx, my, i;
4500          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4501         if(IS_16X16(mb_type)){
4502             for(list=0; list<h->list_count; list++){
4503                     unsigned int val;
4504                     if(IS_DIR(mb_type, 0, list)){
4505                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4506                         if(val >= h->ref_count[list]){
4507                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4508                             return -1;
4509                         }
4510                     }else
4511                         val= LIST_NOT_USED&0xFF;
4512                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4513             }
4514             for(list=0; list<h->list_count; list++){
4515                 unsigned int val;
4516                 if(IS_DIR(mb_type, 0, list)){
4517                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4518                     mx += get_se_golomb(&s->gb);
4519                     my += get_se_golomb(&s->gb);
4520                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4521
4522                     val= pack16to32(mx,my);
4523                 }else
4524                     val=0;
4525                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4526             }
4527         }
4528         else if(IS_16X8(mb_type)){
4529             for(list=0; list<h->list_count; list++){
4530                     for(i=0; i<2; i++){
4531                         unsigned int val;
4532                         if(IS_DIR(mb_type, i, list)){
4533                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4534                             if(val >= h->ref_count[list]){
4535                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4536                                 return -1;
4537                             }
4538                         }else
4539                             val= LIST_NOT_USED&0xFF;
4540                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4541                     }
4542             }
4543             for(list=0; list<h->list_count; list++){
4544                 for(i=0; i<2; i++){
4545                     unsigned int val;
4546                     if(IS_DIR(mb_type, i, list)){
4547                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4548                         mx += get_se_golomb(&s->gb);
4549                         my += get_se_golomb(&s->gb);
4550                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4551
4552                         val= pack16to32(mx,my);
4553                     }else
4554                         val=0;
4555                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4556                 }
4557             }
4558         }else{
4559             assert(IS_8X16(mb_type));
4560             for(list=0; list<h->list_count; list++){
4561                     for(i=0; i<2; i++){
4562                         unsigned int val;
4563                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4564                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4565                             if(val >= h->ref_count[list]){
4566                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4567                                 return -1;
4568                             }
4569                         }else
4570                             val= LIST_NOT_USED&0xFF;
4571                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4572                     }
4573             }
4574             for(list=0; list<h->list_count; list++){
4575                 for(i=0; i<2; i++){
4576                     unsigned int val;
4577                     if(IS_DIR(mb_type, i, list)){
4578                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4579                         mx += get_se_golomb(&s->gb);
4580                         my += get_se_golomb(&s->gb);
4581                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4582
4583                         val= pack16to32(mx,my);
4584                     }else
4585                         val=0;
4586                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4587                 }
4588             }
4589         }
4590     }
4591
4592     if(IS_INTER(mb_type))
4593         write_back_motion(h, mb_type);
4594
4595     if(!IS_INTRA16x16(mb_type)){
4596         cbp= get_ue_golomb(&s->gb);
4597         if(cbp > 47){
4598             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4599             return -1;
4600         }
4601
4602         if(CHROMA){
4603             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4604             else                     cbp= golomb_to_inter_cbp   [cbp];
4605         }else{
4606             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4607             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4608         }
4609     }
4610     h->cbp = cbp;
4611
4612     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4613         if(get_bits1(&s->gb)){
4614             mb_type |= MB_TYPE_8x8DCT;
4615             h->cbp_table[mb_xy]= cbp;
4616         }
4617     }
4618     s->current_picture.mb_type[mb_xy]= mb_type;
4619
4620     if(cbp || IS_INTRA16x16(mb_type)){
4621         int i8x8, i4x4, chroma_idx;
4622         int dquant;
4623         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4624         const uint8_t *scan, *scan8x8, *dc_scan;
4625
4626 //        fill_non_zero_count_cache(h);
4627
4628         if(IS_INTERLACED(mb_type)){
4629             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4630             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4631             dc_scan= luma_dc_field_scan;
4632         }else{
4633             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4634             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4635             dc_scan= luma_dc_zigzag_scan;
4636         }
4637
4638         dquant= get_se_golomb(&s->gb);
4639
4640         if( dquant > 25 || dquant < -26 ){
4641             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4642             return -1;
4643         }
4644
4645         s->qscale += dquant;
4646         if(((unsigned)s->qscale) > 51){
4647             if(s->qscale<0) s->qscale+= 52;
4648             else            s->qscale-= 52;
4649         }
4650
4651         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4652         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4653         if(IS_INTRA16x16(mb_type)){
4654             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4655                 return -1; //FIXME continue if partitioned and other return -1 too
4656             }
4657
4658             assert((cbp&15) == 0 || (cbp&15) == 15);
4659
4660             if(cbp&15){
4661                 for(i8x8=0; i8x8<4; i8x8++){
4662                     for(i4x4=0; i4x4<4; i4x4++){
4663                         const int index= i4x4 + 4*i8x8;
4664                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4665                             return -1;
4666                         }
4667                     }
4668                 }
4669             }else{
4670                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4671             }
4672         }else{
4673             for(i8x8=0; i8x8<4; i8x8++){
4674                 if(cbp & (1<<i8x8)){
4675                     if(IS_8x8DCT(mb_type)){
4676                         DCTELEM *buf = &h->mb[64*i8x8];
4677                         uint8_t *nnz;
4678                         for(i4x4=0; i4x4<4; i4x4++){
4679                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4680                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4681                                 return -1;
4682                         }
4683                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4684                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4685                     }else{
4686                         for(i4x4=0; i4x4<4; i4x4++){
4687                             const int index= i4x4 + 4*i8x8;
4688
4689                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4690                                 return -1;
4691                             }
4692                         }
4693                     }
4694                 }else{
4695                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4696                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4697                 }
4698             }
4699         }
4700
4701         if(cbp&0x30){
4702             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4703                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4704                     return -1;
4705                 }
4706         }
4707
4708         if(cbp&0x20){
4709             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4710                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4711                 for(i4x4=0; i4x4<4; i4x4++){
4712                     const int index= 16 + 4*chroma_idx + i4x4;
4713                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4714                         return -1;
4715                     }
4716                 }
4717             }
4718         }else{
4719             uint8_t * const nnz= &h->non_zero_count_cache[0];
4720             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4721             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4722         }
4723     }else{
4724         uint8_t * const nnz= &h->non_zero_count_cache[0];
4725         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4726         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4727         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4728     }
4729     s->current_picture.qscale_table[mb_xy]= s->qscale;
4730     write_back_non_zero_count(h);
4731
4732     if(MB_MBAFF){
4733         h->ref_count[0] >>= 1;
4734         h->ref_count[1] >>= 1;
4735     }
4736
4737     return 0;
4738 }
4739
4740 static int decode_cabac_field_decoding_flag(H264Context *h) {
4741     MpegEncContext * const s = &h->s;
4742     const int mb_x = s->mb_x;
4743     const int mb_y = s->mb_y & ~1;
4744     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4745     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4746
4747     unsigned int ctx = 0;
4748
4749     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4750         ctx += 1;
4751     }
4752     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4753         ctx += 1;
4754     }
4755
4756     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4757 }
4758
4759 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4760     uint8_t *state= &h->cabac_state[ctx_base];
4761     int mb_type;
4762
4763     if(intra_slice){
4764         MpegEncContext * const s = &h->s;
4765         const int mba_xy = h->left_mb_xy[0];
4766         const int mbb_xy = h->top_mb_xy;
4767         int ctx=0;
4768         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4769             ctx++;
4770         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4771             ctx++;
4772         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4773             return 0;   /* I4x4 */
4774         state += 2;
4775     }else{
4776         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4777             return 0;   /* I4x4 */
4778     }
4779
4780     if( get_cabac_terminate( &h->cabac ) )
4781         return 25;  /* PCM */
4782
4783     mb_type = 1; /* I16x16 */
4784     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4785     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4786         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4787     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4788     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4789     return mb_type;
4790 }
4791
4792 static int decode_cabac_mb_type( H264Context *h ) {
4793     MpegEncContext * const s = &h->s;
4794
4795     if( h->slice_type_nos == FF_I_TYPE ) {
4796         return decode_cabac_intra_mb_type(h, 3, 1);
4797     } else if( h->slice_type_nos == FF_P_TYPE ) {
4798         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4799             /* P-type */
4800             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4801                 /* P_L0_D16x16, P_8x8 */
4802                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4803             } else {
4804                 /* P_L0_D8x16, P_L0_D16x8 */
4805                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4806             }
4807         } else {
4808             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4809         }
4810     } else {
4811         const int mba_xy = h->left_mb_xy[0];
4812         const int mbb_xy = h->top_mb_xy;
4813         int ctx = 0;
4814         int bits;
4815         assert(h->slice_type_nos == FF_B_TYPE);
4816
4817         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4818             ctx++;
4819         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4820             ctx++;
4821
4822         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4823             return 0; /* B_Direct_16x16 */
4824
4825         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4826             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4827         }
4828
4829         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4830         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4831         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4832         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4833         if( bits < 8 )
4834             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4835         else if( bits == 13 ) {
4836             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4837         } else if( bits == 14 )
4838             return 11; /* B_L1_L0_8x16 */
4839         else if( bits == 15 )
4840             return 22; /* B_8x8 */
4841
4842         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4843         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4844     }
4845 }
4846
4847 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4848     MpegEncContext * const s = &h->s;
4849     int mba_xy, mbb_xy;
4850     int ctx = 0;
4851
4852     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4853         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4854         mba_xy = mb_xy - 1;
4855         if( (mb_y&1)
4856             && h->slice_table[mba_xy] == h->slice_num
4857             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4858             mba_xy += s->mb_stride;
4859         if( MB_FIELD ){
4860             mbb_xy = mb_xy - s->mb_stride;
4861             if( !(mb_y&1)
4862                 && h->slice_table[mbb_xy] == h->slice_num
4863                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4864                 mbb_xy -= s->mb_stride;
4865         }else
4866             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4867     }else{
4868         int mb_xy = h->mb_xy;
4869         mba_xy = mb_xy - 1;
4870         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4871     }
4872
4873     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4874         ctx++;
4875     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4876         ctx++;
4877
4878     if( h->slice_type_nos == FF_B_TYPE )
4879         ctx += 13;
4880     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4881 }
4882
4883 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4884     int mode = 0;
4885
4886     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4887         return pred_mode;
4888
4889     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4890     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4891     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4892
4893     if( mode >= pred_mode )
4894         return mode + 1;
4895     else
4896         return mode;
4897 }
4898
4899 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4900     const int mba_xy = h->left_mb_xy[0];
4901     const int mbb_xy = h->top_mb_xy;
4902
4903     int ctx = 0;
4904
4905     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4906     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4907         ctx++;
4908
4909     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4910         ctx++;
4911
4912     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4913         return 0;
4914
4915     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4916         return 1;
4917     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4918         return 2;
4919     else
4920         return 3;
4921 }
4922
4923 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4924     int cbp_b, cbp_a, ctx, cbp = 0;
4925
4926     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4927     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4928
4929     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4930     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4931     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4932     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4933     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4934     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4935     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4936     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4937     return cbp;
4938 }
4939 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4940     int ctx;
4941     int cbp_a, cbp_b;
4942
4943     cbp_a = (h->left_cbp>>4)&0x03;
4944     cbp_b = (h-> top_cbp>>4)&0x03;
4945
4946     ctx = 0;
4947     if( cbp_a > 0 ) ctx++;
4948     if( cbp_b > 0 ) ctx += 2;
4949     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4950         return 0;
4951
4952     ctx = 4;
4953     if( cbp_a == 2 ) ctx++;
4954     if( cbp_b == 2 ) ctx += 2;
4955     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4956 }
4957 static int decode_cabac_mb_dqp( H264Context *h) {
4958     int   ctx= h->last_qscale_diff != 0;
4959     int   val = 0;
4960
4961     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4962         ctx= 2+(ctx>>1);
4963         val++;
4964         if(val > 102) //prevent infinite loop
4965             return INT_MIN;
4966     }
4967
4968     if( val&0x01 )
4969         return   (val + 1)>>1 ;
4970     else
4971         return -((val + 1)>>1);
4972 }
4973 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4974     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4975         return 0;   /* 8x8 */
4976     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4977         return 1;   /* 8x4 */
4978     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4979         return 2;   /* 4x8 */
4980     return 3;       /* 4x4 */
4981 }
4982 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4983     int type;
4984     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4985         return 0;   /* B_Direct_8x8 */
4986     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4987         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4988     type = 3;
4989     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4990         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4991             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4992         type += 4;
4993     }
4994     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4995     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4996     return type;
4997 }
4998
4999 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5000     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5001 }
5002
5003 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5004     int refa = h->ref_cache[list][scan8[n] - 1];
5005     int refb = h->ref_cache[list][scan8[n] - 8];
5006     int ref  = 0;
5007     int ctx  = 0;
5008
5009     if( h->slice_type_nos == FF_B_TYPE) {
5010         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5011             ctx++;
5012         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5013             ctx += 2;
5014     } else {
5015         if( refa > 0 )
5016             ctx++;
5017         if( refb > 0 )
5018             ctx += 2;
5019     }
5020
5021     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5022         ref++;
5023         if( ctx < 4 )
5024             ctx = 4;
5025         else
5026             ctx = 5;
5027         if(ref >= 32 /*h->ref_list[list]*/){
5028             return -1;
5029         }
5030     }
5031     return ref;
5032 }
5033
5034 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5035     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5036                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5037     int ctxbase = (l == 0) ? 40 : 47;
5038     int mvd;
5039     int ctx = (amvd>2) + (amvd>32);
5040
5041     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5042         return 0;
5043
5044     mvd= 1;
5045     ctx= 3;
5046     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5047         mvd++;
5048         if( ctx < 6 )
5049             ctx++;
5050     }
5051
5052     if( mvd >= 9 ) {
5053         int k = 3;
5054         while( get_cabac_bypass( &h->cabac ) ) {
5055             mvd += 1 << k;
5056             k++;
5057             if(k>24){
5058                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5059                 return INT_MIN;
5060             }
5061         }
5062         while( k-- ) {
5063             if( get_cabac_bypass( &h->cabac ) )
5064                 mvd += 1 << k;
5065         }
5066     }
5067     return get_cabac_bypass_sign( &h->cabac, -mvd );
5068 }
5069
5070 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5071     int nza, nzb;
5072     int ctx = 0;
5073
5074     if( is_dc ) {
5075         if( cat == 0 ) {
5076             nza = h->left_cbp&0x100;
5077             nzb = h-> top_cbp&0x100;
5078         } else {
5079             nza = (h->left_cbp>>(6+idx))&0x01;
5080             nzb = (h-> top_cbp>>(6+idx))&0x01;
5081         }
5082     } else {
5083         assert(cat == 1 || cat == 2 || cat == 4);
5084         nza = h->non_zero_count_cache[scan8[idx] - 1];
5085         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5086     }
5087
5088     if( nza > 0 )
5089         ctx++;
5090
5091     if( nzb > 0 )
5092         ctx += 2;
5093
5094     return ctx + 4 * cat;
5095 }
5096
5097 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5098     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5099     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5100     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5101     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5102 };
5103
5104 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5105     static const int significant_coeff_flag_offset[2][6] = {
5106       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5107       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5108     };
5109     static const int last_coeff_flag_offset[2][6] = {
5110       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5111       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5112     };
5113     static const int coeff_abs_level_m1_offset[6] = {
5114         227+0, 227+10, 227+20, 227+30, 227+39, 426
5115     };
5116     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5117       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5118         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5119         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5120        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5121       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5122         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5123         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5124         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5125     };
5126     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5127      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5128      * map node ctx => cabac ctx for level=1 */
5129     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5130     /* map node ctx => cabac ctx for level>1 */
5131     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5132     static const uint8_t coeff_abs_level_transition[2][8] = {
5133     /* update node ctx after decoding a level=1 */
5134         { 1, 2, 3, 3, 4, 5, 6, 7 },
5135     /* update node ctx after decoding a level>1 */
5136         { 4, 4, 4, 4, 5, 6, 7, 7 }
5137     };
5138
5139     int index[64];
5140
5141     int av_unused last;
5142     int coeff_count = 0;
5143     int node_ctx = 0;
5144
5145     uint8_t *significant_coeff_ctx_base;
5146     uint8_t *last_coeff_ctx_base;
5147     uint8_t *abs_level_m1_ctx_base;
5148
5149 #ifndef ARCH_X86
5150 #define CABAC_ON_STACK
5151 #endif
5152 #ifdef CABAC_ON_STACK
5153 #define CC &cc
5154     CABACContext cc;
5155     cc.range     = h->cabac.range;
5156     cc.low       = h->cabac.low;
5157     cc.bytestream= h->cabac.bytestream;
5158 #else
5159 #define CC &h->cabac
5160 #endif
5161
5162
5163     /* cat: 0-> DC 16x16  n = 0
5164      *      1-> AC 16x16  n = luma4x4idx
5165      *      2-> Luma4x4   n = luma4x4idx
5166      *      3-> DC Chroma n = iCbCr
5167      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5168      *      5-> Luma8x8   n = 4 * luma8x8idx
5169      */
5170
5171     /* read coded block flag */
5172     if( is_dc || cat != 5 ) {
5173         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5174             if( !is_dc )
5175                 h->non_zero_count_cache[scan8[n]] = 0;
5176
5177 #ifdef CABAC_ON_STACK
5178             h->cabac.range     = cc.range     ;
5179             h->cabac.low       = cc.low       ;
5180             h->cabac.bytestream= cc.bytestream;
5181 #endif
5182             return;
5183         }
5184     }
5185
5186     significant_coeff_ctx_base = h->cabac_state
5187         + significant_coeff_flag_offset[MB_FIELD][cat];
5188     last_coeff_ctx_base = h->cabac_state
5189         + last_coeff_flag_offset[MB_FIELD][cat];
5190     abs_level_m1_ctx_base = h->cabac_state
5191         + coeff_abs_level_m1_offset[cat];
5192
5193     if( !is_dc && cat == 5 ) {
5194 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5195         for(last= 0; last < coefs; last++) { \
5196             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5197             if( get_cabac( CC, sig_ctx )) { \
5198                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5199                 index[coeff_count++] = last; \
5200                 if( get_cabac( CC, last_ctx ) ) { \
5201                     last= max_coeff; \
5202                     break; \
5203                 } \
5204             } \
5205         }\
5206         if( last == max_coeff -1 ) {\
5207             index[coeff_count++] = last;\
5208         }
5209         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5210 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5211         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5212     } else {
5213         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5214 #else
5215         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5216     } else {
5217         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5218 #endif
5219     }
5220     assert(coeff_count > 0);
5221
5222     if( is_dc ) {
5223         if( cat == 0 )
5224             h->cbp_table[h->mb_xy] |= 0x100;
5225         else
5226             h->cbp_table[h->mb_xy] |= 0x40 << n;
5227     } else {
5228         if( cat == 5 )
5229             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5230         else {
5231             assert( cat == 1 || cat == 2 || cat == 4 );
5232             h->non_zero_count_cache[scan8[n]] = coeff_count;
5233         }
5234     }
5235
5236     do {
5237         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5238
5239         int j= scantable[index[--coeff_count]];
5240
5241         if( get_cabac( CC, ctx ) == 0 ) {
5242             node_ctx = coeff_abs_level_transition[0][node_ctx];
5243             if( is_dc ) {
5244                 block[j] = get_cabac_bypass_sign( CC, -1);
5245             }else{
5246                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5247             }
5248         } else {
5249             int coeff_abs = 2;
5250             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5251             node_ctx = coeff_abs_level_transition[1][node_ctx];
5252
5253             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5254                 coeff_abs++;
5255             }
5256
5257             if( coeff_abs >= 15 ) {
5258                 int j = 0;
5259                 while( get_cabac_bypass( CC ) ) {
5260                     j++;
5261                 }
5262
5263                 coeff_abs=1;
5264                 while( j-- ) {
5265                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5266                 }
5267                 coeff_abs+= 14;
5268             }
5269
5270             if( is_dc ) {
5271                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5272             }else{
5273                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5274             }
5275         }
5276     } while( coeff_count );
5277 #ifdef CABAC_ON_STACK
5278             h->cabac.range     = cc.range     ;
5279             h->cabac.low       = cc.low       ;
5280             h->cabac.bytestream= cc.bytestream;
5281 #endif
5282
5283 }
5284
5285 #ifndef CONFIG_SMALL
5286 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5287     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5288 }
5289
5290 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5291     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5292 }
5293 #endif
5294
5295 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5296 #ifdef CONFIG_SMALL
5297     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5298 #else
5299     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5300     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5301 #endif
5302 }
5303
5304 static inline void compute_mb_neighbors(H264Context *h)
5305 {
5306     MpegEncContext * const s = &h->s;
5307     const int mb_xy  = h->mb_xy;
5308     h->top_mb_xy     = mb_xy - s->mb_stride;
5309     h->left_mb_xy[0] = mb_xy - 1;
5310     if(FRAME_MBAFF){
5311         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5312         const int top_pair_xy      = pair_xy     - s->mb_stride;
5313         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5314         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5315         const int curr_mb_frame_flag = !MB_FIELD;
5316         const int bottom = (s->mb_y & 1);
5317         if (bottom
5318                 ? !curr_mb_frame_flag // bottom macroblock
5319                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5320                 ) {
5321             h->top_mb_xy -= s->mb_stride;
5322         }
5323         if (left_mb_frame_flag != curr_mb_frame_flag) {
5324             h->left_mb_xy[0] = pair_xy - 1;
5325         }
5326     } else if (FIELD_PICTURE) {
5327         h->top_mb_xy -= s->mb_stride;
5328     }
5329     return;
5330 }
5331
5332 /**
5333  * decodes a macroblock
5334  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5335  */
5336 static int decode_mb_cabac(H264Context *h) {
5337     MpegEncContext * const s = &h->s;
5338     int mb_xy;
5339     int mb_type, partition_count, cbp = 0;
5340     int dct8x8_allowed= h->pps.transform_8x8_mode;
5341
5342     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5343
5344     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5345     if( h->slice_type_nos != FF_I_TYPE ) {
5346         int skip;
5347         /* a skipped mb needs the aff flag from the following mb */
5348         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5349             predict_field_decoding_flag(h);
5350         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5351             skip = h->next_mb_skipped;
5352         else
5353             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5354         /* read skip flags */
5355         if( skip ) {
5356             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5357                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5358                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5359                 if(!h->next_mb_skipped)
5360                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5361             }
5362
5363             decode_mb_skip(h);
5364
5365             h->cbp_table[mb_xy] = 0;
5366             h->chroma_pred_mode_table[mb_xy] = 0;
5367             h->last_qscale_diff = 0;
5368
5369             return 0;
5370
5371         }
5372     }
5373     if(FRAME_MBAFF){
5374         if( (s->mb_y&1) == 0 )
5375             h->mb_mbaff =
5376             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5377     }
5378
5379     h->prev_mb_skipped = 0;
5380
5381     compute_mb_neighbors(h);
5382     mb_type = decode_cabac_mb_type( h );
5383     assert(mb_type >= 0);
5384
5385     if( h->slice_type_nos == FF_B_TYPE ) {
5386         if( mb_type < 23 ){
5387             partition_count= b_mb_type_info[mb_type].partition_count;
5388             mb_type=         b_mb_type_info[mb_type].type;
5389         }else{
5390             mb_type -= 23;
5391             goto decode_intra_mb;
5392         }
5393     } else if( h->slice_type_nos == FF_P_TYPE ) {
5394         if( mb_type < 5) {
5395             partition_count= p_mb_type_info[mb_type].partition_count;
5396             mb_type=         p_mb_type_info[mb_type].type;
5397         } else {
5398             mb_type -= 5;
5399             goto decode_intra_mb;
5400         }
5401     } else {
5402         if(h->slice_type == FF_SI_TYPE && mb_type)
5403             mb_type--;
5404         assert(h->slice_type_nos == FF_I_TYPE);
5405 decode_intra_mb:
5406         partition_count = 0;
5407         cbp= i_mb_type_info[mb_type].cbp;
5408         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5409         mb_type= i_mb_type_info[mb_type].type;
5410     }
5411     if(MB_FIELD)
5412         mb_type |= MB_TYPE_INTERLACED;
5413
5414     h->slice_table[ mb_xy ]= h->slice_num;
5415
5416     if(IS_INTRA_PCM(mb_type)) {
5417         const uint8_t *ptr;
5418
5419         // We assume these blocks are very rare so we do not optimize it.
5420         // FIXME The two following lines get the bitstream position in the cabac
5421         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5422         ptr= h->cabac.bytestream;
5423         if(h->cabac.low&0x1) ptr--;
5424         if(CABAC_BITS==16){
5425             if(h->cabac.low&0x1FF) ptr--;
5426         }
5427
5428         // The pixels are stored in the same order as levels in h->mb array.
5429         memcpy(h->mb, ptr, 256); ptr+=256;
5430         if(CHROMA){
5431             memcpy(h->mb+128, ptr, 128); ptr+=128;
5432         }
5433
5434         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5435
5436         // All blocks are present
5437         h->cbp_table[mb_xy] = 0x1ef;
5438         h->chroma_pred_mode_table[mb_xy] = 0;
5439         // In deblocking, the quantizer is 0
5440         s->current_picture.qscale_table[mb_xy]= 0;
5441         // All coeffs are present
5442         memset(h->non_zero_count[mb_xy], 16, 16);
5443         s->current_picture.mb_type[mb_xy]= mb_type;
5444         h->last_qscale_diff = 0;
5445         return 0;
5446     }
5447
5448     if(MB_MBAFF){
5449         h->ref_count[0] <<= 1;
5450         h->ref_count[1] <<= 1;
5451     }
5452
5453     fill_caches(h, mb_type, 0);
5454
5455     if( IS_INTRA( mb_type ) ) {
5456         int i, pred_mode;
5457         if( IS_INTRA4x4( mb_type ) ) {
5458             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5459                 mb_type |= MB_TYPE_8x8DCT;
5460                 for( i = 0; i < 16; i+=4 ) {
5461                     int pred = pred_intra_mode( h, i );
5462                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5463                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5464                 }
5465             } else {
5466                 for( i = 0; i < 16; i++ ) {
5467                     int pred = pred_intra_mode( h, i );
5468                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5469
5470                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5471                 }
5472             }
5473             write_back_intra_pred_mode(h);
5474             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5475         } else {
5476             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5477             if( h->intra16x16_pred_mode < 0 ) return -1;
5478         }
5479         if(CHROMA){
5480             h->chroma_pred_mode_table[mb_xy] =
5481             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5482
5483             pred_mode= check_intra_pred_mode( h, pred_mode );
5484             if( pred_mode < 0 ) return -1;
5485             h->chroma_pred_mode= pred_mode;
5486         }
5487     } else if( partition_count == 4 ) {
5488         int i, j, sub_partition_count[4], list, ref[2][4];
5489
5490         if( h->slice_type_nos == FF_B_TYPE ) {
5491             for( i = 0; i < 4; i++ ) {
5492                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5493                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5494                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5495             }
5496             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5497                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5498                 pred_direct_motion(h, &mb_type);
5499                 h->ref_cache[0][scan8[4]] =
5500                 h->ref_cache[1][scan8[4]] =
5501                 h->ref_cache[0][scan8[12]] =
5502                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5503                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5504                     for( i = 0; i < 4; i++ )
5505                         if( IS_DIRECT(h->sub_mb_type[i]) )
5506                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5507                 }
5508             }
5509         } else {
5510             for( i = 0; i < 4; i++ ) {
5511                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5512                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5513                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5514             }
5515         }
5516
5517         for( list = 0; list < h->list_count; list++ ) {
5518                 for( i = 0; i < 4; i++ ) {
5519                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5520                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5521                         if( h->ref_count[list] > 1 ){
5522                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5523                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5524                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5525                                 return -1;
5526                             }
5527                         }else
5528                             ref[list][i] = 0;
5529                     } else {
5530                         ref[list][i] = -1;
5531                     }
5532                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5533                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5534                 }
5535         }
5536
5537         if(dct8x8_allowed)
5538             dct8x8_allowed = get_dct8x8_allowed(h);
5539
5540         for(list=0; list<h->list_count; list++){
5541             for(i=0; i<4; i++){
5542                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5543                 if(IS_DIRECT(h->sub_mb_type[i])){
5544                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5545                     continue;
5546                 }
5547
5548                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5549                     const int sub_mb_type= h->sub_mb_type[i];
5550                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5551                     for(j=0; j<sub_partition_count[i]; j++){
5552                         int mpx, mpy;
5553                         int mx, my;
5554                         const int index= 4*i + block_width*j;
5555                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5556                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5557                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5558
5559                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5560                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5561                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5562
5563                         if(IS_SUB_8X8(sub_mb_type)){
5564                             mv_cache[ 1 ][0]=
5565                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5566                             mv_cache[ 1 ][1]=
5567                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5568
5569                             mvd_cache[ 1 ][0]=
5570                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5571                             mvd_cache[ 1 ][1]=
5572                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5573                         }else if(IS_SUB_8X4(sub_mb_type)){
5574                             mv_cache[ 1 ][0]= mx;
5575                             mv_cache[ 1 ][1]= my;
5576
5577                             mvd_cache[ 1 ][0]= mx - mpx;
5578                             mvd_cache[ 1 ][1]= my - mpy;
5579                         }else if(IS_SUB_4X8(sub_mb_type)){
5580                             mv_cache[ 8 ][0]= mx;
5581                             mv_cache[ 8 ][1]= my;
5582
5583                             mvd_cache[ 8 ][0]= mx - mpx;
5584                             mvd_cache[ 8 ][1]= my - mpy;
5585                         }
5586                         mv_cache[ 0 ][0]= mx;
5587                         mv_cache[ 0 ][1]= my;
5588
5589                         mvd_cache[ 0 ][0]= mx - mpx;
5590                         mvd_cache[ 0 ][1]= my - mpy;
5591                     }
5592                 }else{
5593                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5594                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5595                     p[0] = p[1] = p[8] = p[9] = 0;
5596                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5597                 }
5598             }
5599         }
5600     } else if( IS_DIRECT(mb_type) ) {
5601         pred_direct_motion(h, &mb_type);
5602         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5603         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5604         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5605     } else {
5606         int list, mx, my, i, mpx, mpy;
5607         if(IS_16X16(mb_type)){
5608             for(list=0; list<h->list_count; list++){
5609                 if(IS_DIR(mb_type, 0, list)){
5610                     int ref;
5611                     if(h->ref_count[list] > 1){
5612                         ref= decode_cabac_mb_ref(h, list, 0);
5613                         if(ref >= (unsigned)h->ref_count[list]){
5614                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5615                             return -1;
5616                         }
5617                     }else
5618                         ref=0;
5619                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5620                 }else
5621                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5622             }
5623             for(list=0; list<h->list_count; list++){
5624                 if(IS_DIR(mb_type, 0, list)){
5625                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5626
5627                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5628                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5629                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5630
5631                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5632                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5633                 }else
5634                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5635             }
5636         }
5637         else if(IS_16X8(mb_type)){
5638             for(list=0; list<h->list_count; list++){
5639                     for(i=0; i<2; i++){
5640                         if(IS_DIR(mb_type, i, list)){
5641                             int ref;
5642                             if(h->ref_count[list] > 1){
5643                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5644                                 if(ref >= (unsigned)h->ref_count[list]){
5645                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5646                                     return -1;
5647                                 }
5648                             }else
5649                                 ref=0;
5650                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5651                         }else
5652                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5653                     }
5654             }
5655             for(list=0; list<h->list_count; list++){
5656                 for(i=0; i<2; i++){
5657                     if(IS_DIR(mb_type, i, list)){
5658                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5659                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5660                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5661                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5662
5663                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5664                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5665                     }else{
5666                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5667                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5668                     }
5669                 }
5670             }
5671         }else{
5672             assert(IS_8X16(mb_type));
5673             for(list=0; list<h->list_count; list++){
5674                     for(i=0; i<2; i++){
5675                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5676                             int ref;
5677                             if(h->ref_count[list] > 1){
5678                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5679                                 if(ref >= (unsigned)h->ref_count[list]){
5680                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5681                                     return -1;
5682                                 }
5683                             }else
5684                                 ref=0;
5685                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5686                         }else
5687                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5688                     }
5689             }
5690             for(list=0; list<h->list_count; list++){
5691                 for(i=0; i<2; i++){
5692                     if(IS_DIR(mb_type, i, list)){
5693                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5694                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5695                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5696
5697                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5698                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5699                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5700                     }else{
5701                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5702                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5703                     }
5704                 }
5705             }
5706         }
5707     }
5708
5709    if( IS_INTER( mb_type ) ) {
5710         h->chroma_pred_mode_table[mb_xy] = 0;
5711         write_back_motion( h, mb_type );
5712    }
5713
5714     if( !IS_INTRA16x16( mb_type ) ) {
5715         cbp  = decode_cabac_mb_cbp_luma( h );
5716         if(CHROMA)
5717             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5718     }
5719
5720     h->cbp_table[mb_xy] = h->cbp = cbp;
5721
5722     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5723         if( decode_cabac_mb_transform_size( h ) )
5724             mb_type |= MB_TYPE_8x8DCT;
5725     }
5726     s->current_picture.mb_type[mb_xy]= mb_type;
5727
5728     if( cbp || IS_INTRA16x16( mb_type ) ) {
5729         const uint8_t *scan, *scan8x8, *dc_scan;
5730         const uint32_t *qmul;
5731         int dqp;
5732
5733         if(IS_INTERLACED(mb_type)){
5734             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5735             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5736             dc_scan= luma_dc_field_scan;
5737         }else{
5738             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5739             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5740             dc_scan= luma_dc_zigzag_scan;
5741         }
5742
5743         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5744         if( dqp == INT_MIN ){
5745             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5746             return -1;
5747         }
5748         s->qscale += dqp;
5749         if(((unsigned)s->qscale) > 51){
5750             if(s->qscale<0) s->qscale+= 52;
5751             else            s->qscale-= 52;
5752         }
5753         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5754         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5755
5756         if( IS_INTRA16x16( mb_type ) ) {
5757             int i;
5758             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5759             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5760
5761             if( cbp&15 ) {
5762                 qmul = h->dequant4_coeff[0][s->qscale];
5763                 for( i = 0; i < 16; i++ ) {
5764                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5765                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5766                 }
5767             } else {
5768                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5769             }
5770         } else {
5771             int i8x8, i4x4;
5772             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5773                 if( cbp & (1<<i8x8) ) {
5774                     if( IS_8x8DCT(mb_type) ) {
5775                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5776                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5777                     } else {
5778                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5779                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5780                             const int index = 4*i8x8 + i4x4;
5781                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5782 //START_TIMER
5783                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5784 //STOP_TIMER("decode_residual")
5785                         }
5786                     }
5787                 } else {
5788                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5789                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5790                 }
5791             }
5792         }
5793
5794         if( cbp&0x30 ){
5795             int c;
5796             for( c = 0; c < 2; c++ ) {
5797                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5798                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5799             }
5800         }
5801
5802         if( cbp&0x20 ) {
5803             int c, i;
5804             for( c = 0; c < 2; c++ ) {
5805                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5806                 for( i = 0; i < 4; i++ ) {
5807                     const int index = 16 + 4 * c + i;
5808                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5809                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5810                 }
5811             }
5812         } else {
5813             uint8_t * const nnz= &h->non_zero_count_cache[0];
5814             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5815             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5816         }
5817     } else {
5818         uint8_t * const nnz= &h->non_zero_count_cache[0];
5819         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5820         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5821         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5822         h->last_qscale_diff = 0;
5823     }
5824
5825     s->current_picture.qscale_table[mb_xy]= s->qscale;
5826     write_back_non_zero_count(h);
5827
5828     if(MB_MBAFF){
5829         h->ref_count[0] >>= 1;
5830         h->ref_count[1] >>= 1;
5831     }
5832
5833     return 0;
5834 }
5835
5836
5837 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5838     const int index_a = qp + h->slice_alpha_c0_offset;
5839     const int alpha = (alpha_table+52)[index_a];
5840     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5841
5842     if( bS[0] < 4 ) {
5843         int8_t tc[4];
5844         tc[0] = (tc0_table+52)[index_a][bS[0]];
5845         tc[1] = (tc0_table+52)[index_a][bS[1]];
5846         tc[2] = (tc0_table+52)[index_a][bS[2]];
5847         tc[3] = (tc0_table+52)[index_a][bS[3]];
5848         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5849     } else {
5850         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5851     }
5852 }
5853 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5854     const int index_a = qp + h->slice_alpha_c0_offset;
5855     const int alpha = (alpha_table+52)[index_a];
5856     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5857
5858     if( bS[0] < 4 ) {
5859         int8_t tc[4];
5860         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
5861         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
5862         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
5863         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
5864         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5865     } else {
5866         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5867     }
5868 }
5869
5870 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5871     int i;
5872     for( i = 0; i < 16; i++, pix += stride) {
5873         int index_a;
5874         int alpha;
5875         int beta;
5876
5877         int qp_index;
5878         int bS_index = (i >> 1);
5879         if (!MB_FIELD) {
5880             bS_index &= ~1;
5881             bS_index |= (i & 1);
5882         }
5883
5884         if( bS[bS_index] == 0 ) {
5885             continue;
5886         }
5887
5888         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5889         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5890         alpha = (alpha_table+52)[index_a];
5891         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5892
5893         if( bS[bS_index] < 4 ) {
5894             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5895             const int p0 = pix[-1];
5896             const int p1 = pix[-2];
5897             const int p2 = pix[-3];
5898             const int q0 = pix[0];
5899             const int q1 = pix[1];
5900             const int q2 = pix[2];
5901
5902             if( FFABS( p0 - q0 ) < alpha &&
5903                 FFABS( p1 - p0 ) < beta &&
5904                 FFABS( q1 - q0 ) < beta ) {
5905                 int tc = tc0;
5906                 int i_delta;
5907
5908                 if( FFABS( p2 - p0 ) < beta ) {
5909                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5910                     tc++;
5911                 }
5912                 if( FFABS( q2 - q0 ) < beta ) {
5913                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5914                     tc++;
5915                 }
5916
5917                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5918                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5919                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5920                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5921             }
5922         }else{
5923             const int p0 = pix[-1];
5924             const int p1 = pix[-2];
5925             const int p2 = pix[-3];
5926
5927             const int q0 = pix[0];
5928             const int q1 = pix[1];
5929             const int q2 = pix[2];
5930
5931             if( FFABS( p0 - q0 ) < alpha &&
5932                 FFABS( p1 - p0 ) < beta &&
5933                 FFABS( q1 - q0 ) < beta ) {
5934
5935                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5936                     if( FFABS( p2 - p0 ) < beta)
5937                     {
5938                         const int p3 = pix[-4];
5939                         /* p0', p1', p2' */
5940                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5941                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5942                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5943                     } else {
5944                         /* p0' */
5945                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5946                     }
5947                     if( FFABS( q2 - q0 ) < beta)
5948                     {
5949                         const int q3 = pix[3];
5950                         /* q0', q1', q2' */
5951                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5952                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5953                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5954                     } else {
5955                         /* q0' */
5956                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5957                     }
5958                 }else{
5959                     /* p0', q0' */
5960                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5961                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5962                 }
5963                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5964             }
5965         }
5966     }
5967 }
5968 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5969     int i;
5970     for( i = 0; i < 8; i++, pix += stride) {
5971         int index_a;
5972         int alpha;
5973         int beta;
5974
5975         int qp_index;
5976         int bS_index = i;
5977
5978         if( bS[bS_index] == 0 ) {
5979             continue;
5980         }
5981
5982         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5983         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5984         alpha = (alpha_table+52)[index_a];
5985         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5986
5987         if( bS[bS_index] < 4 ) {
5988             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
5989             const int p0 = pix[-1];
5990             const int p1 = pix[-2];
5991             const int q0 = pix[0];
5992             const int q1 = pix[1];
5993
5994             if( FFABS( p0 - q0 ) < alpha &&
5995                 FFABS( p1 - p0 ) < beta &&
5996                 FFABS( q1 - q0 ) < beta ) {
5997                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5998
5999                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6000                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6001                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6002             }
6003         }else{
6004             const int p0 = pix[-1];
6005             const int p1 = pix[-2];
6006             const int q0 = pix[0];
6007             const int q1 = pix[1];
6008
6009             if( FFABS( p0 - q0 ) < alpha &&
6010                 FFABS( p1 - p0 ) < beta &&
6011                 FFABS( q1 - q0 ) < beta ) {
6012
6013                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6014                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6015                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6016             }
6017         }
6018     }
6019 }
6020
6021 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6022     const int index_a = qp + h->slice_alpha_c0_offset;
6023     const int alpha = (alpha_table+52)[index_a];
6024     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6025
6026     if( bS[0] < 4 ) {
6027         int8_t tc[4];
6028         tc[0] = (tc0_table+52)[index_a][bS[0]];
6029         tc[1] = (tc0_table+52)[index_a][bS[1]];
6030         tc[2] = (tc0_table+52)[index_a][bS[2]];
6031         tc[3] = (tc0_table+52)[index_a][bS[3]];
6032         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6033     } else {
6034         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6035     }
6036 }
6037
6038 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6039     const int index_a = qp + h->slice_alpha_c0_offset;
6040     const int alpha = (alpha_table+52)[index_a];
6041     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6042
6043     if( bS[0] < 4 ) {
6044         int8_t tc[4];
6045         tc[0] = (tc0_table+52)[index_a][bS[0]]+1;
6046         tc[1] = (tc0_table+52)[index_a][bS[1]]+1;
6047         tc[2] = (tc0_table+52)[index_a][bS[2]]+1;
6048         tc[3] = (tc0_table+52)[index_a][bS[3]]+1;
6049         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6050     } else {
6051         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6052     }
6053 }
6054
6055 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6056     MpegEncContext * const s = &h->s;
6057     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6058     int mb_xy, mb_type;
6059     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6060
6061     mb_xy = h->mb_xy;
6062
6063     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6064         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6065        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6066                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6067         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6068         return;
6069     }
6070     assert(!FRAME_MBAFF);
6071
6072     mb_type = s->current_picture.mb_type[mb_xy];
6073     qp = s->current_picture.qscale_table[mb_xy];
6074     qp0 = s->current_picture.qscale_table[mb_xy-1];
6075     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6076     qpc = get_chroma_qp( h, 0, qp );
6077     qpc0 = get_chroma_qp( h, 0, qp0 );
6078     qpc1 = get_chroma_qp( h, 0, qp1 );
6079     qp0 = (qp + qp0 + 1) >> 1;
6080     qp1 = (qp + qp1 + 1) >> 1;
6081     qpc0 = (qpc + qpc0 + 1) >> 1;
6082     qpc1 = (qpc + qpc1 + 1) >> 1;
6083     qp_thresh = 15 - h->slice_alpha_c0_offset;
6084     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6085        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6086         return;
6087
6088     if( IS_INTRA(mb_type) ) {
6089         int16_t bS4[4] = {4,4,4,4};
6090         int16_t bS3[4] = {3,3,3,3};
6091         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6092         if( IS_8x8DCT(mb_type) ) {
6093             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6094             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6095             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6096             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6097         } else {
6098             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6099             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6100             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6101             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6102             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6103             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6104             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6105             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6106         }
6107         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6108         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6109         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6110         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6111         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6112         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6113         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6114         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6115         return;
6116     } else {
6117         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6118         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6119         int edges;
6120         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6121             edges = 4;
6122             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6123         } else {
6124             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6125                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6126             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6127                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6128                              ? 3 : 0;
6129             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6130             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6131             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6132                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6133         }
6134         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6135             bSv[0][0] = 0x0004000400040004ULL;
6136         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6137             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6138
6139 #define FILTER(hv,dir,edge)\
6140         if(bSv[dir][edge]) {\
6141             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6142             if(!(edge&1)) {\
6143                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6144                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6145             }\
6146         }
6147         if( edges == 1 ) {
6148             FILTER(v,0,0);
6149             FILTER(h,1,0);
6150         } else if( IS_8x8DCT(mb_type) ) {
6151             FILTER(v,0,0);
6152             FILTER(v,0,2);
6153             FILTER(h,1,0);
6154             FILTER(h,1,2);
6155         } else {
6156             FILTER(v,0,0);
6157             FILTER(v,0,1);
6158             FILTER(v,0,2);
6159             FILTER(v,0,3);
6160             FILTER(h,1,0);
6161             FILTER(h,1,1);
6162             FILTER(h,1,2);
6163             FILTER(h,1,3);
6164         }
6165 #undef FILTER
6166     }
6167 }
6168
6169
6170 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6171     MpegEncContext * const s = &h->s;
6172     int edge;
6173     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6174     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6175     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6176     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6177     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6178
6179     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6180                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6181     // how often to recheck mv-based bS when iterating between edges
6182     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6183                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6184     // how often to recheck mv-based bS when iterating along each edge
6185     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6186
6187     if (first_vertical_edge_done) {
6188         start = 1;
6189     }
6190
6191     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6192         start = 1;
6193
6194     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6195         && !IS_INTERLACED(mb_type)
6196         && IS_INTERLACED(mbm_type)
6197         ) {
6198         // This is a special case in the norm where the filtering must
6199         // be done twice (one each of the field) even if we are in a
6200         // frame macroblock.
6201         //
6202         static const int nnz_idx[4] = {4,5,6,3};
6203         unsigned int tmp_linesize   = 2 *   linesize;
6204         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6205         int mbn_xy = mb_xy - 2 * s->mb_stride;
6206         int qp;
6207         int i, j;
6208         int16_t bS[4];
6209
6210         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6211             if( IS_INTRA(mb_type) ||
6212                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6213                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6214             } else {
6215                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6216                 for( i = 0; i < 4; i++ ) {
6217                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6218                         mbn_nnz[nnz_idx[i]] != 0 )
6219                         bS[i] = 2;
6220                     else
6221                         bS[i] = 1;
6222                 }
6223             }
6224             // Do not use s->qscale as luma quantizer because it has not the same
6225             // value in IPCM macroblocks.
6226             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6227             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6228             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6229             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6230             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6231                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6232             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6233                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6234         }
6235
6236         start = 1;
6237     }
6238
6239     /* Calculate bS */
6240     for( edge = start; edge < edges; edge++ ) {
6241         /* mbn_xy: neighbor macroblock */
6242         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6243         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6244         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6245         int16_t bS[4];
6246         int qp;
6247
6248         if( (edge&1) && IS_8x8DCT(mb_type) )
6249             continue;
6250
6251         if( IS_INTRA(mb_type) ||
6252             IS_INTRA(mbn_type) ) {
6253             int value;
6254             if (edge == 0) {
6255                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6256                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6257                 ) {
6258                     value = 4;
6259                 } else {
6260                     value = 3;
6261                 }
6262             } else {
6263                 value = 3;
6264             }
6265             bS[0] = bS[1] = bS[2] = bS[3] = value;
6266         } else {
6267             int i, l;
6268             int mv_done;
6269
6270             if( edge & mask_edge ) {
6271                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6272                 mv_done = 1;
6273             }
6274             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6275                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6276                 mv_done = 1;
6277             }
6278             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6279                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6280                 int bn_idx= b_idx - (dir ? 8:1);
6281                 int v = 0;
6282
6283                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6284                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6285                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6286                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6287                 }
6288
6289                 if(h->slice_type_nos == FF_B_TYPE && v){
6290                     v=0;
6291                     for( l = 0; !v && l < 2; l++ ) {
6292                         int ln= 1-l;
6293                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6294                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6295                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6296                     }
6297                 }
6298
6299                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6300                 mv_done = 1;
6301             }
6302             else
6303                 mv_done = 0;
6304
6305             for( i = 0; i < 4; i++ ) {
6306                 int x = dir == 0 ? edge : i;
6307                 int y = dir == 0 ? i    : edge;
6308                 int b_idx= 8 + 4 + x + 8*y;
6309                 int bn_idx= b_idx - (dir ? 8:1);
6310
6311                 if( h->non_zero_count_cache[b_idx] |
6312                     h->non_zero_count_cache[bn_idx] ) {
6313                     bS[i] = 2;
6314                 }
6315                 else if(!mv_done)
6316                 {
6317                     bS[i] = 0;
6318                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6319                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6320                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6321                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6322                             bS[i] = 1;
6323                             break;
6324                         }
6325                     }
6326
6327                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6328                         bS[i] = 0;
6329                         for( l = 0; l < 2; l++ ) {
6330                             int ln= 1-l;
6331                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6332                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6333                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6334                                 bS[i] = 1;
6335                                 break;
6336                             }
6337                         }
6338                     }
6339                 }
6340             }
6341
6342             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6343                 continue;
6344         }
6345
6346         /* Filter edge */
6347         // Do not use s->qscale as luma quantizer because it has not the same
6348         // value in IPCM macroblocks.
6349         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6350         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6351         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6352         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6353         if( dir == 0 ) {
6354             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6355             if( (edge&1) == 0 ) {
6356                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6357                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6358                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6359                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6360             }
6361         } else {
6362             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6363             if( (edge&1) == 0 ) {
6364                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6365                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6366                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6367                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6368             }
6369         }
6370     }
6371 }
6372
6373 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6374     MpegEncContext * const s = &h->s;
6375     const int mb_xy= mb_x + mb_y*s->mb_stride;
6376     const int mb_type = s->current_picture.mb_type[mb_xy];
6377     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6378     int first_vertical_edge_done = 0;
6379     int dir;
6380
6381     //for sufficiently low qp, filtering wouldn't do anything
6382     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6383     if(!FRAME_MBAFF){
6384         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6385         int qp = s->current_picture.qscale_table[mb_xy];
6386         if(qp <= qp_thresh
6387            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6388            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6389             return;
6390         }
6391     }
6392
6393     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6394     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6395         int top_type, left_type[2];
6396         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6397         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6398         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6399
6400         if(IS_8x8DCT(top_type)){
6401             h->non_zero_count_cache[4+8*0]=
6402             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6403             h->non_zero_count_cache[6+8*0]=
6404             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6405         }
6406         if(IS_8x8DCT(left_type[0])){
6407             h->non_zero_count_cache[3+8*1]=
6408             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6409         }
6410         if(IS_8x8DCT(left_type[1])){
6411             h->non_zero_count_cache[3+8*3]=
6412             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6413         }
6414
6415         if(IS_8x8DCT(mb_type)){
6416             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6417             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6418
6419             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6420             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6421
6422             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6423             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6424
6425             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6426             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6427         }
6428     }
6429
6430     if (FRAME_MBAFF
6431             // left mb is in picture
6432             && h->slice_table[mb_xy-1] != 0xFFFF
6433             // and current and left pair do not have the same interlaced type
6434             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6435             // and left mb is in the same slice if deblocking_filter == 2
6436             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6437         /* First vertical edge is different in MBAFF frames
6438          * There are 8 different bS to compute and 2 different Qp
6439          */
6440         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6441         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6442         int16_t bS[8];
6443         int qp[2];
6444         int bqp[2];
6445         int rqp[2];
6446         int mb_qp, mbn0_qp, mbn1_qp;
6447         int i;
6448         first_vertical_edge_done = 1;
6449
6450         if( IS_INTRA(mb_type) )
6451             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6452         else {
6453             for( i = 0; i < 8; i++ ) {
6454                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6455
6456                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6457                     bS[i] = 4;
6458                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6459                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6460                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6461                                                                        :
6462                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6463                     bS[i] = 2;
6464                 else
6465                     bS[i] = 1;
6466             }
6467         }
6468
6469         mb_qp = s->current_picture.qscale_table[mb_xy];
6470         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6471         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6472         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6473         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6474                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6475         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6476                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6477         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6478         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6479                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6480         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6481                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6482
6483         /* Filter edge */
6484         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6485         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6486         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6487         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6488         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6489     }
6490
6491 #ifdef CONFIG_SMALL
6492     for( dir = 0; dir < 2; dir++ )
6493         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6494 #else
6495     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6496     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6497 #endif
6498 }
6499
6500 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6501     H264Context *h = *(void**)arg;
6502     MpegEncContext * const s = &h->s;
6503     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6504
6505     s->mb_skip_run= -1;
6506
6507     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6508                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6509
6510     if( h->pps.cabac ) {
6511         int i;
6512
6513         /* realign */
6514         align_get_bits( &s->gb );
6515
6516         /* init cabac */
6517         ff_init_cabac_states( &h->cabac);
6518         ff_init_cabac_decoder( &h->cabac,
6519                                s->gb.buffer + get_bits_count(&s->gb)/8,
6520                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6521         /* calculate pre-state */
6522         for( i= 0; i < 460; i++ ) {
6523             int pre;
6524             if( h->slice_type_nos == FF_I_TYPE )
6525                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6526             else
6527                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6528
6529             if( pre <= 63 )
6530                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6531             else
6532                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6533         }
6534
6535         for(;;){
6536 //START_TIMER
6537             int ret = decode_mb_cabac(h);
6538             int eos;
6539 //STOP_TIMER("decode_mb_cabac")
6540
6541             if(ret>=0) hl_decode_mb(h);
6542
6543             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6544                 s->mb_y++;
6545
6546                 ret = decode_mb_cabac(h);
6547
6548                 if(ret>=0) hl_decode_mb(h);
6549                 s->mb_y--;
6550             }
6551             eos = get_cabac_terminate( &h->cabac );
6552
6553             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6554                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6555                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6556                 return -1;
6557             }
6558
6559             if( ++s->mb_x >= s->mb_width ) {
6560                 s->mb_x = 0;
6561                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6562                 ++s->mb_y;
6563                 if(FIELD_OR_MBAFF_PICTURE) {
6564                     ++s->mb_y;
6565                 }
6566             }
6567
6568             if( eos || s->mb_y >= s->mb_height ) {
6569                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6570                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6571                 return 0;
6572             }
6573         }
6574
6575     } else {
6576         for(;;){
6577             int ret = decode_mb_cavlc(h);
6578
6579             if(ret>=0) hl_decode_mb(h);
6580
6581             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6582                 s->mb_y++;
6583                 ret = decode_mb_cavlc(h);
6584
6585                 if(ret>=0) hl_decode_mb(h);
6586                 s->mb_y--;
6587             }
6588
6589             if(ret<0){
6590                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6591                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6592
6593                 return -1;
6594             }
6595
6596             if(++s->mb_x >= s->mb_width){
6597                 s->mb_x=0;
6598                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6599                 ++s->mb_y;
6600                 if(FIELD_OR_MBAFF_PICTURE) {
6601                     ++s->mb_y;
6602                 }
6603                 if(s->mb_y >= s->mb_height){
6604                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6605
6606                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6607                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6608
6609                         return 0;
6610                     }else{
6611                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6612
6613                         return -1;
6614                     }
6615                 }
6616             }
6617
6618             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6619                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6620                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6621                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6622
6623                     return 0;
6624                 }else{
6625                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6626
6627                     return -1;
6628                 }
6629             }
6630         }
6631     }
6632
6633 #if 0
6634     for(;s->mb_y < s->mb_height; s->mb_y++){
6635         for(;s->mb_x < s->mb_width; s->mb_x++){
6636             int ret= decode_mb(h);
6637
6638             hl_decode_mb(h);
6639
6640             if(ret<0){
6641                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6642                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6643
6644                 return -1;
6645             }
6646
6647             if(++s->mb_x >= s->mb_width){
6648                 s->mb_x=0;
6649                 if(++s->mb_y >= s->mb_height){
6650                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6651                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6652
6653                         return 0;
6654                     }else{
6655                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6656
6657                         return -1;
6658                     }
6659                 }
6660             }
6661
6662             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6663                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6664                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6665
6666                     return 0;
6667                 }else{
6668                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6669
6670                     return -1;
6671                 }
6672             }
6673         }
6674         s->mb_x=0;
6675         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6676     }
6677 #endif
6678     return -1; //not reached
6679 }
6680
6681 static int decode_picture_timing(H264Context *h){
6682     MpegEncContext * const s = &h->s;
6683     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6684         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6685         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6686     }
6687     if(h->sps.pic_struct_present_flag){
6688         unsigned int i, num_clock_ts;
6689         h->sei_pic_struct = get_bits(&s->gb, 4);
6690
6691         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6692             return -1;
6693
6694         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6695
6696         for (i = 0 ; i < num_clock_ts ; i++){
6697             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6698                 unsigned int full_timestamp_flag;
6699                 skip_bits(&s->gb, 2);                 /* ct_type */
6700                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6701                 skip_bits(&s->gb, 5);                 /* counting_type */
6702                 full_timestamp_flag = get_bits(&s->gb, 1);
6703                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6704                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6705                 skip_bits(&s->gb, 8);                 /* n_frames */
6706                 if(full_timestamp_flag){
6707                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6708                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6709                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6710                 }else{
6711                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6712                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6713                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6714                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6715                             if(get_bits(&s->gb, 1))   /* hours_flag */
6716                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6717                         }
6718                     }
6719                 }
6720                 if(h->sps.time_offset_length > 0)
6721                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6722             }
6723         }
6724     }
6725     return 0;
6726 }
6727
6728 static int decode_unregistered_user_data(H264Context *h, int size){
6729     MpegEncContext * const s = &h->s;
6730     uint8_t user_data[16+256];
6731     int e, build, i;
6732
6733     if(size<16)
6734         return -1;
6735
6736     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6737         user_data[i]= get_bits(&s->gb, 8);
6738     }
6739
6740     user_data[i]= 0;
6741     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6742     if(e==1 && build>=0)
6743         h->x264_build= build;
6744
6745     if(s->avctx->debug & FF_DEBUG_BUGS)
6746         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6747
6748     for(; i<size; i++)
6749         skip_bits(&s->gb, 8);
6750
6751     return 0;
6752 }
6753
6754 static int decode_sei(H264Context *h){
6755     MpegEncContext * const s = &h->s;
6756
6757     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6758         int size, type;
6759
6760         type=0;
6761         do{
6762             type+= show_bits(&s->gb, 8);
6763         }while(get_bits(&s->gb, 8) == 255);
6764
6765         size=0;
6766         do{
6767             size+= show_bits(&s->gb, 8);
6768         }while(get_bits(&s->gb, 8) == 255);
6769
6770         switch(type){
6771         case 1: // Picture timing SEI
6772             if(decode_picture_timing(h) < 0)
6773                 return -1;
6774             break;
6775         case 5:
6776             if(decode_unregistered_user_data(h, size) < 0)
6777                 return -1;
6778             break;
6779         default:
6780             skip_bits(&s->gb, 8*size);
6781         }
6782
6783         //FIXME check bits here
6784         align_get_bits(&s->gb);
6785     }
6786
6787     return 0;
6788 }
6789
6790 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6791     MpegEncContext * const s = &h->s;
6792     int cpb_count, i;
6793     cpb_count = get_ue_golomb(&s->gb) + 1;
6794
6795     if(cpb_count > 32U){
6796         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6797         return -1;
6798     }
6799
6800     get_bits(&s->gb, 4); /* bit_rate_scale */
6801     get_bits(&s->gb, 4); /* cpb_size_scale */
6802     for(i=0; i<cpb_count; i++){
6803         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6804         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6805         get_bits1(&s->gb);     /* cbr_flag */
6806     }
6807     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6808     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6809     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6810     sps->time_offset_length = get_bits(&s->gb, 5);
6811     return 0;
6812 }
6813
6814 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6815     MpegEncContext * const s = &h->s;
6816     int aspect_ratio_info_present_flag;
6817     unsigned int aspect_ratio_idc;
6818
6819     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6820
6821     if( aspect_ratio_info_present_flag ) {
6822         aspect_ratio_idc= get_bits(&s->gb, 8);
6823         if( aspect_ratio_idc == EXTENDED_SAR ) {
6824             sps->sar.num= get_bits(&s->gb, 16);
6825             sps->sar.den= get_bits(&s->gb, 16);
6826         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6827             sps->sar=  pixel_aspect[aspect_ratio_idc];
6828         }else{
6829             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6830             return -1;
6831         }
6832     }else{
6833         sps->sar.num=
6834         sps->sar.den= 0;
6835     }
6836 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6837
6838     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6839         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6840     }
6841
6842     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6843         get_bits(&s->gb, 3);    /* video_format */
6844         get_bits1(&s->gb);      /* video_full_range_flag */
6845         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6846             get_bits(&s->gb, 8); /* colour_primaries */
6847             get_bits(&s->gb, 8); /* transfer_characteristics */
6848             get_bits(&s->gb, 8); /* matrix_coefficients */
6849         }
6850     }
6851
6852     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6853         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6854         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6855     }
6856
6857     sps->timing_info_present_flag = get_bits1(&s->gb);
6858     if(sps->timing_info_present_flag){
6859         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6860         sps->time_scale = get_bits_long(&s->gb, 32);
6861         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6862     }
6863
6864     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6865     if(sps->nal_hrd_parameters_present_flag)
6866         if(decode_hrd_parameters(h, sps) < 0)
6867             return -1;
6868     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6869     if(sps->vcl_hrd_parameters_present_flag)
6870         if(decode_hrd_parameters(h, sps) < 0)
6871             return -1;
6872     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6873         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6874     sps->pic_struct_present_flag = get_bits1(&s->gb);
6875
6876     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6877     if(sps->bitstream_restriction_flag){
6878         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6879         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6880         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6881         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6882         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6883         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6884         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6885
6886         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6887             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6888             return -1;
6889         }
6890     }
6891
6892     return 0;
6893 }
6894
6895 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6896                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6897     MpegEncContext * const s = &h->s;
6898     int i, last = 8, next = 8;
6899     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6900     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6901         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6902     else
6903     for(i=0;i<size;i++){
6904         if(next)
6905             next = (last + get_se_golomb(&s->gb)) & 0xff;
6906         if(!i && !next){ /* matrix not written, we use the preset one */
6907             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6908             break;
6909         }
6910         last = factors[scan[i]] = next ? next : last;
6911     }
6912 }
6913
6914 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6915                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6916     MpegEncContext * const s = &h->s;
6917     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6918     const uint8_t *fallback[4] = {
6919         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6920         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6921         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6922         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6923     };
6924     if(get_bits1(&s->gb)){
6925         sps->scaling_matrix_present |= is_sps;
6926         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6927         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6928         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6929         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6930         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6931         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6932         if(is_sps || pps->transform_8x8_mode){
6933             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6934             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6935         }
6936     }
6937 }
6938
6939 static inline int decode_seq_parameter_set(H264Context *h){
6940     MpegEncContext * const s = &h->s;
6941     int profile_idc, level_idc;
6942     unsigned int sps_id;
6943     int i;
6944     SPS *sps;
6945
6946     profile_idc= get_bits(&s->gb, 8);
6947     get_bits1(&s->gb);   //constraint_set0_flag
6948     get_bits1(&s->gb);   //constraint_set1_flag
6949     get_bits1(&s->gb);   //constraint_set2_flag
6950     get_bits1(&s->gb);   //constraint_set3_flag
6951     get_bits(&s->gb, 4); // reserved
6952     level_idc= get_bits(&s->gb, 8);
6953     sps_id= get_ue_golomb(&s->gb);
6954
6955     if(sps_id >= MAX_SPS_COUNT) {
6956         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
6957         return -1;
6958     }
6959     sps= av_mallocz(sizeof(SPS));
6960     if(sps == NULL)
6961         return -1;
6962
6963     sps->profile_idc= profile_idc;
6964     sps->level_idc= level_idc;
6965
6966     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
6967     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
6968     sps->scaling_matrix_present = 0;
6969
6970     if(sps->profile_idc >= 100){ //high profile
6971         sps->chroma_format_idc= get_ue_golomb(&s->gb);
6972         if(sps->chroma_format_idc == 3)
6973             get_bits1(&s->gb);  //residual_color_transform_flag
6974         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6975         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6976         sps->transform_bypass = get_bits1(&s->gb);
6977         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
6978     }else{
6979         sps->chroma_format_idc= 1;
6980     }
6981
6982     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6983     sps->poc_type= get_ue_golomb(&s->gb);
6984
6985     if(sps->poc_type == 0){ //FIXME #define
6986         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6987     } else if(sps->poc_type == 1){//FIXME #define
6988         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6989         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6990         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6991         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
6992
6993         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
6994             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
6995             goto fail;
6996         }
6997
6998         for(i=0; i<sps->poc_cycle_length; i++)
6999             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7000     }else if(sps->poc_type != 2){
7001         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7002         goto fail;
7003     }
7004
7005     sps->ref_frame_count= get_ue_golomb(&s->gb);
7006     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7007         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7008         goto fail;
7009     }
7010     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7011     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7012     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7013     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7014        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7015         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7016         goto fail;
7017     }
7018
7019     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7020     if(!sps->frame_mbs_only_flag)
7021         sps->mb_aff= get_bits1(&s->gb);
7022     else
7023         sps->mb_aff= 0;
7024
7025     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7026
7027 #ifndef ALLOW_INTERLACE
7028     if(sps->mb_aff)
7029         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7030 #endif
7031     sps->crop= get_bits1(&s->gb);
7032     if(sps->crop){
7033         sps->crop_left  = get_ue_golomb(&s->gb);
7034         sps->crop_right = get_ue_golomb(&s->gb);
7035         sps->crop_top   = get_ue_golomb(&s->gb);
7036         sps->crop_bottom= get_ue_golomb(&s->gb);
7037         if(sps->crop_left || sps->crop_top){
7038             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7039         }
7040         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7041             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7042         }
7043     }else{
7044         sps->crop_left  =
7045         sps->crop_right =
7046         sps->crop_top   =
7047         sps->crop_bottom= 0;
7048     }
7049
7050     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7051     if( sps->vui_parameters_present_flag )
7052         decode_vui_parameters(h, sps);
7053
7054     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7055         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7056                sps_id, sps->profile_idc, sps->level_idc,
7057                sps->poc_type,
7058                sps->ref_frame_count,
7059                sps->mb_width, sps->mb_height,
7060                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7061                sps->direct_8x8_inference_flag ? "8B8" : "",
7062                sps->crop_left, sps->crop_right,
7063                sps->crop_top, sps->crop_bottom,
7064                sps->vui_parameters_present_flag ? "VUI" : "",
7065                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7066                );
7067     }
7068     av_free(h->sps_buffers[sps_id]);
7069     h->sps_buffers[sps_id]= sps;
7070     return 0;
7071 fail:
7072     av_free(sps);
7073     return -1;
7074 }
7075
7076 static void
7077 build_qp_table(PPS *pps, int t, int index)
7078 {
7079     int i;
7080     for(i = 0; i < 52; i++)
7081         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7082 }
7083
7084 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7085     MpegEncContext * const s = &h->s;
7086     unsigned int pps_id= get_ue_golomb(&s->gb);
7087     PPS *pps;
7088
7089     if(pps_id >= MAX_PPS_COUNT) {
7090         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7091         return -1;
7092     }
7093
7094     pps= av_mallocz(sizeof(PPS));
7095     if(pps == NULL)
7096         return -1;
7097     pps->sps_id= get_ue_golomb(&s->gb);
7098     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7099         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7100         goto fail;
7101     }
7102
7103     pps->cabac= get_bits1(&s->gb);
7104     pps->pic_order_present= get_bits1(&s->gb);
7105     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7106     if(pps->slice_group_count > 1 ){
7107         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7108         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7109         switch(pps->mb_slice_group_map_type){
7110         case 0:
7111 #if 0
7112 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7113 |    run_length[ i ]                                |1  |ue(v)   |
7114 #endif
7115             break;
7116         case 2:
7117 #if 0
7118 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7119 |{                                                  |   |        |
7120 |    top_left_mb[ i ]                               |1  |ue(v)   |
7121 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7122 |   }                                               |   |        |
7123 #endif
7124             break;
7125         case 3:
7126         case 4:
7127         case 5:
7128 #if 0
7129 |   slice_group_change_direction_flag               |1  |u(1)    |
7130 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7131 #endif
7132             break;
7133         case 6:
7134 #if 0
7135 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7136 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7137 |)                                                  |   |        |
7138 |    slice_group_id[ i ]                            |1  |u(v)    |
7139 #endif
7140             break;
7141         }
7142     }
7143     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7144     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7145     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7146         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7147         goto fail;
7148     }
7149
7150     pps->weighted_pred= get_bits1(&s->gb);
7151     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7152     pps->init_qp= get_se_golomb(&s->gb) + 26;
7153     pps->init_qs= get_se_golomb(&s->gb) + 26;
7154     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7155     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7156     pps->constrained_intra_pred= get_bits1(&s->gb);
7157     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7158
7159     pps->transform_8x8_mode= 0;
7160     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7161     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7162     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7163
7164     if(get_bits_count(&s->gb) < bit_length){
7165         pps->transform_8x8_mode= get_bits1(&s->gb);
7166         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7167         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7168     } else {
7169         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7170     }
7171
7172     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7173     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7174     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7175         h->pps.chroma_qp_diff= 1;
7176
7177     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7178         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7179                pps_id, pps->sps_id,
7180                pps->cabac ? "CABAC" : "CAVLC",
7181                pps->slice_group_count,
7182                pps->ref_count[0], pps->ref_count[1],
7183                pps->weighted_pred ? "weighted" : "",
7184                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7185                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7186                pps->constrained_intra_pred ? "CONSTR" : "",
7187                pps->redundant_pic_cnt_present ? "REDU" : "",
7188                pps->transform_8x8_mode ? "8x8DCT" : ""
7189                );
7190     }
7191
7192     av_free(h->pps_buffers[pps_id]);
7193     h->pps_buffers[pps_id]= pps;
7194     return 0;
7195 fail:
7196     av_free(pps);
7197     return -1;
7198 }
7199
7200 /**
7201  * Call decode_slice() for each context.
7202  *
7203  * @param h h264 master context
7204  * @param context_count number of contexts to execute
7205  */
7206 static void execute_decode_slices(H264Context *h, int context_count){
7207     MpegEncContext * const s = &h->s;
7208     AVCodecContext * const avctx= s->avctx;
7209     H264Context *hx;
7210     int i;
7211
7212     if(context_count == 1) {
7213         decode_slice(avctx, &h);
7214     } else {
7215         for(i = 1; i < context_count; i++) {
7216             hx = h->thread_context[i];
7217             hx->s.error_recognition = avctx->error_recognition;
7218             hx->s.error_count = 0;
7219         }
7220
7221         avctx->execute(avctx, (void *)decode_slice,
7222                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7223
7224         /* pull back stuff from slices to master context */
7225         hx = h->thread_context[context_count - 1];
7226         s->mb_x = hx->s.mb_x;
7227         s->mb_y = hx->s.mb_y;
7228         s->dropable = hx->s.dropable;
7229         s->picture_structure = hx->s.picture_structure;
7230         for(i = 1; i < context_count; i++)
7231             h->s.error_count += h->thread_context[i]->s.error_count;
7232     }
7233 }
7234
7235
7236 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7237     MpegEncContext * const s = &h->s;
7238     AVCodecContext * const avctx= s->avctx;
7239     int buf_index=0;
7240     H264Context *hx; ///< thread context
7241     int context_count = 0;
7242
7243     h->max_contexts = avctx->thread_count;
7244 #if 0
7245     int i;
7246     for(i=0; i<50; i++){
7247         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7248     }
7249 #endif
7250     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7251         h->current_slice = 0;
7252         if (!s->first_field)
7253             s->current_picture_ptr= NULL;
7254     }
7255
7256     for(;;){
7257         int consumed;
7258         int dst_length;
7259         int bit_length;
7260         const uint8_t *ptr;
7261         int i, nalsize = 0;
7262         int err;
7263
7264         if(h->is_avc) {
7265             if(buf_index >= buf_size) break;
7266             nalsize = 0;
7267             for(i = 0; i < h->nal_length_size; i++)
7268                 nalsize = (nalsize << 8) | buf[buf_index++];
7269             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7270                 if(nalsize == 1){
7271                     buf_index++;
7272                     continue;
7273                 }else{
7274                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7275                     break;
7276                 }
7277             }
7278         } else {
7279             // start code prefix search
7280             for(; buf_index + 3 < buf_size; buf_index++){
7281                 // This should always succeed in the first iteration.
7282                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7283                     break;
7284             }
7285
7286             if(buf_index+3 >= buf_size) break;
7287
7288             buf_index+=3;
7289         }
7290
7291         hx = h->thread_context[context_count];
7292
7293         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7294         if (ptr==NULL || dst_length < 0){
7295             return -1;
7296         }
7297         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7298             dst_length--;
7299         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7300
7301         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7302             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7303         }
7304
7305         if (h->is_avc && (nalsize != consumed)){
7306             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7307             consumed= nalsize;
7308         }
7309
7310         buf_index += consumed;
7311
7312         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7313            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7314             continue;
7315
7316       again:
7317         err = 0;
7318         switch(hx->nal_unit_type){
7319         case NAL_IDR_SLICE:
7320             if (h->nal_unit_type != NAL_IDR_SLICE) {
7321                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7322                 return -1;
7323             }
7324             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7325         case NAL_SLICE:
7326             init_get_bits(&hx->s.gb, ptr, bit_length);
7327             hx->intra_gb_ptr=
7328             hx->inter_gb_ptr= &hx->s.gb;
7329             hx->s.data_partitioning = 0;
7330
7331             if((err = decode_slice_header(hx, h)))
7332                break;
7333
7334             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7335             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7336                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7337                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7338                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7339                && avctx->skip_frame < AVDISCARD_ALL)
7340                 context_count++;
7341             break;
7342         case NAL_DPA:
7343             init_get_bits(&hx->s.gb, ptr, bit_length);
7344             hx->intra_gb_ptr=
7345             hx->inter_gb_ptr= NULL;
7346             hx->s.data_partitioning = 1;
7347
7348             err = decode_slice_header(hx, h);
7349             break;
7350         case NAL_DPB:
7351             init_get_bits(&hx->intra_gb, ptr, bit_length);
7352             hx->intra_gb_ptr= &hx->intra_gb;
7353             break;
7354         case NAL_DPC:
7355             init_get_bits(&hx->inter_gb, ptr, bit_length);
7356             hx->inter_gb_ptr= &hx->inter_gb;
7357
7358             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7359                && s->context_initialized
7360                && s->hurry_up < 5
7361                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7362                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7363                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7364                && avctx->skip_frame < AVDISCARD_ALL)
7365                 context_count++;
7366             break;
7367         case NAL_SEI:
7368             init_get_bits(&s->gb, ptr, bit_length);
7369             decode_sei(h);
7370             break;
7371         case NAL_SPS:
7372             init_get_bits(&s->gb, ptr, bit_length);
7373             decode_seq_parameter_set(h);
7374
7375             if(s->flags& CODEC_FLAG_LOW_DELAY)
7376                 s->low_delay=1;
7377
7378             if(avctx->has_b_frames < 2)
7379                 avctx->has_b_frames= !s->low_delay;
7380             break;
7381         case NAL_PPS:
7382             init_get_bits(&s->gb, ptr, bit_length);
7383
7384             decode_picture_parameter_set(h, bit_length);
7385
7386             break;
7387         case NAL_AUD:
7388         case NAL_END_SEQUENCE:
7389         case NAL_END_STREAM:
7390         case NAL_FILLER_DATA:
7391         case NAL_SPS_EXT:
7392         case NAL_AUXILIARY_SLICE:
7393             break;
7394         default:
7395             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7396         }
7397
7398         if(context_count == h->max_contexts) {
7399             execute_decode_slices(h, context_count);
7400             context_count = 0;
7401         }
7402
7403         if (err < 0)
7404             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7405         else if(err == 1) {
7406             /* Slice could not be decoded in parallel mode, copy down
7407              * NAL unit stuff to context 0 and restart. Note that
7408              * rbsp_buffer is not transferred, but since we no longer
7409              * run in parallel mode this should not be an issue. */
7410             h->nal_unit_type = hx->nal_unit_type;
7411             h->nal_ref_idc   = hx->nal_ref_idc;
7412             hx = h;
7413             goto again;
7414         }
7415     }
7416     if(context_count)
7417         execute_decode_slices(h, context_count);
7418     return buf_index;
7419 }
7420
7421 /**
7422  * returns the number of bytes consumed for building the current frame
7423  */
7424 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7425         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7426         if(pos+10>buf_size) pos=buf_size; // oops ;)
7427
7428         return pos;
7429 }
7430
/**
 * Decode one input buffer and, if a picture is ready, return it.
 *
 * With buf_size == 0 no decoding happens; one picture still held in
 * h->delayed_pic[] is returned instead. Otherwise the buffer is passed to
 * decode_nal_units() (after a one-time parse of the extradata) and the
 * current picture is appended to h->delayed_pic[], from which the picture
 * with the lowest poc is selected for output once enough pictures are held.
 *
 * @param avctx     codec context; avctx->priv_data is the H264Context
 * @param data      AVFrame that receives the output picture
 * @param data_size set to sizeof(AVFrame) when a picture is written to data,
 *                  and to 0 while waiting for the second field; other paths
 *                  leave it untouched
 * @param buf       input data
 * @param buf_size  size of buf in bytes, 0 to drain delayed pictures
 * @return value of get_consumed_bytes() on success, 0 when draining or when
 *         no frame was produced because of skip_frame/hurry_up, -1 on error
 */
static int decode_frame(AVCodecContext *avctx,
                             void *data, int *data_size,
                             const uint8_t *buf, int buf_size)
{
    H264Context *h = avctx->priv_data;
    MpegEncContext *s = &h->s;
    AVFrame *pict = data;
    int buf_index;

    s->flags= avctx->flags;
    s->flags2= avctx->flags2;

   /* end of stream, output what is still in the buffers */
    if (buf_size == 0) {
        Picture *out;
        int i, out_idx;

//FIXME factorize this with the output code below
        /* Pick the lowest poc among the leading delayed pictures; the scan
         * stops at the first entry with poc == 0 or key_frame set. */
        out = h->delayed_pic[0];
        out_idx = 0;
        for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
            if(h->delayed_pic[i]->poc < out->poc){
                out = h->delayed_pic[i];
                out_idx = i;
            }

        /* Remove the chosen entry by shifting the following ones down. */
        for(i=out_idx; h->delayed_pic[i]; i++)
            h->delayed_pic[i] = h->delayed_pic[i+1];

        if(out){
            *data_size = sizeof(AVFrame);
            *pict= *(AVFrame*)out;
        }

        return 0;
    }

    /* AVC (length-prefixed) stream: parse the avcC extradata once. */
    if(h->is_avc && !h->got_avcC) {
        int i, cnt, nalsize;
        unsigned char *p = avctx->extradata;
        if(avctx->extradata_size < 7) {
            av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
            return -1;
        }
        if(*p != 1) {
            av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
            return -1;
        }
        /* sps and pps in the avcC always have length coded with 2 bytes,
           so put a fake nal_length_size = 2 while parsing them */
        h->nal_length_size = 2;
        // Decode sps from avcC
        cnt = *(p+5) & 0x1f; // Number of sps
        p += 6;
        for (i = 0; i < cnt; i++) {
            /* 2-byte length prefix plus the payload it announces */
            nalsize = AV_RB16(p) + 2;
            if(decode_nal_units(h, p, nalsize) < 0) {
                av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
                return -1;
            }
            p += nalsize;
        }
        // Decode pps from avcC
        cnt = *(p++); // Number of pps
        for (i = 0; i < cnt; i++) {
            nalsize = AV_RB16(p) + 2;
            if(decode_nal_units(h, p, nalsize)  != nalsize) {
                av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
                return -1;
            }
            p += nalsize;
        }
        // Now store right nal length size, that will be use to parse all other nals
        h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
        // Do not reparse avcC
        h->got_avcC = 1;
    }

    /* Non-AVC stream with extradata: run the extradata through the NAL
     * decoder once; got_avcC is reused as the "already done" flag. */
    if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
        if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
            return -1;
        h->got_avcC = 1;
    }

    buf_index=decode_nal_units(h, buf, buf_size);
    if(buf_index < 0)
        return -1;

    /* Outside chunk mode a buffer without any picture is an error, unless
     * frames are being skipped on purpose. */
    if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
        if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
        av_log(avctx, AV_LOG_ERROR, "no frame!\n");
        return -1;
    }

    /* Finish the picture: always outside chunk mode, and in chunk mode only
     * once mb_y has reached a non-zero mb_height. */
    if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
        Picture *out = s->current_picture_ptr;
        Picture *cur = s->current_picture_ptr;
        int i, pics, cross_idr, out_of_order, out_idx;

        s->mb_y= 0;

        s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
        s->current_picture_ptr->pict_type= s->pict_type;

        /* Reference marking and poc msb/lsb history are updated only when
         * s->dropable is not set. */
        if(!s->dropable) {
            execute_ref_pic_marking(h, h->mmco, h->mmco_index);
            h->prev_poc_msb= h->poc_msb;
            h->prev_poc_lsb= h->poc_lsb;
        }
        h->prev_frame_num_offset= h->frame_num_offset;
        h->prev_frame_num= h->frame_num;

        /*
         * FIXME: Error handling code does not seem to support interlaced
         * when slices span multiple rows
         * The ff_er_add_slice calls don't work right for bottom
         * fields; they cause massive erroneous error concealing
         * Error marking covers both fields (top and bottom).
         * This causes a mismatched s->error_count
         * and a bad error table. Further, the error count goes to
         * INT_MAX when called for bottom field, because mb_y is
         * past end by one (callers fault) and resync_mb_y != 0
         * causes problems for the first MB line, too.
         */
        if (!FIELD_PICTURE)
            ff_er_frame_end(s);

        MPV_frame_end(s);

        if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
            /* Wait for second field. */
            *data_size = 0;

        } else {
            cur->repeat_pict = 0;

            /* Signal interlacing information externally. */
            /* Prioritize picture timing SEI information over used decoding process if it exists. */
            if(h->sps.pic_struct_present_flag){
                switch (h->sei_pic_struct)
                {
                case SEI_PIC_STRUCT_FRAME:
                    cur->interlaced_frame = 0;
                    break;
                case SEI_PIC_STRUCT_TOP_FIELD:
                case SEI_PIC_STRUCT_BOTTOM_FIELD:
                case SEI_PIC_STRUCT_TOP_BOTTOM:
                case SEI_PIC_STRUCT_BOTTOM_TOP:
                    cur->interlaced_frame = 1;
                    break;
                case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
                case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
                    // Signal the possibility of telecined film externally (pic_struct 5,6)
                    // From these hints, let the applications decide if they apply deinterlacing.
                    cur->repeat_pict = 1;
                    cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
                    break;
                case SEI_PIC_STRUCT_FRAME_DOUBLING:
                    // Force progressive here, as doubling interlaced frame is a bad idea.
                    cur->interlaced_frame = 0;
                    cur->repeat_pict = 2;
                    break;
                case SEI_PIC_STRUCT_FRAME_TRIPLING:
                    cur->interlaced_frame = 0;
                    cur->repeat_pict = 4;
                    break;
                }
            }else{
                /* Derive interlacing flag from used decoding process. */
                cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
            }

            if (cur->field_poc[0] != cur->field_poc[1]){
                /* Derive top_field_first from field pocs. */
                cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
            }else{
                if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
                    /* Use picture timing SEI information. Even if it is information from a past frame, it is better than nothing. */
                    if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
                      || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
                        cur->top_field_first = 1;
                    else
                        cur->top_field_first = 0;
                }else{
                    /* Most likely progressive */
                    cur->top_field_first = 0;
                }
            }

        //FIXME do something with unavailable reference frames

            /* Sort B-frames into display order */

            /* Raise has_b_frames to the reorder depth announced by the SPS. */
            if(h->sps.bitstream_restriction_flag
               && s->avctx->has_b_frames < h->sps.num_reorder_frames){
                s->avctx->has_b_frames = h->sps.num_reorder_frames;
                s->low_delay = 0;
            }

            /* Strict compliance without an announced reorder depth: use the
             * maximum delay. */
            if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
               && !h->sps.bitstream_restriction_flag){
                s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
                s->low_delay= 0;
            }

            /* Count the pictures already waiting for output. */
            pics = 0;
            while(h->delayed_pic[pics]) pics++;

            assert(pics <= MAX_DELAYED_PIC_COUNT);

            /* Append the current picture; a non-reference picture is tagged
             * with DELAYED_PIC_REF while it is held for output. */
            h->delayed_pic[pics++] = cur;
            if(cur->reference == 0)
                cur->reference = DELAYED_PIC_REF;

            /* Same selection as in the end-of-stream path above. */
            out = h->delayed_pic[0];
            out_idx = 0;
            for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
                if(h->delayed_pic[i]->poc < out->poc){
                    out = h->delayed_pic[i];
                    out_idx = i;
                }
            /* Set when the first delayed picture has poc 0 or is a key frame,
             * or when the scan above stopped before the end of the list. */
            cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;

            /* The candidate would be output after a picture with a higher poc. */
            out_of_order = !cross_idr && out->poc < h->outputed_poc;

            /* Grow the reorder delay by one when the stream turns out to need
             * it, unless the SPS already announced a sufficient depth. */
            if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
                { }
            else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
               || (s->low_delay &&
                ((!cross_idr && out->poc > h->outputed_poc + 2)
                 || cur->pict_type == FF_B_TYPE)))
            {
                s->low_delay = 0;
                s->avctx->has_b_frames++;
            }

            /* Drop the candidate from the list when it is out of order or
             * when more pictures are held than the delay allows. */
            if(out_of_order || pics > s->avctx->has_b_frames){
                out->reference &= ~DELAYED_PIC_REF;
                for(i=out_idx; h->delayed_pic[i]; i++)
                    h->delayed_pic[i] = h->delayed_pic[i+1];
            }
            /* Only an in-order candidate is actually returned to the caller. */
            if(!out_of_order && pics > s->avctx->has_b_frames){
                *data_size = sizeof(AVFrame);

                h->outputed_poc = out->poc;
                *pict= *(AVFrame*)out;
            }else{
                av_log(avctx, AV_LOG_DEBUG, "no picture\n");
            }
        }
    }

    assert(pict->data[0] || !*data_size);
    ff_print_debug_info(s, pict);
//printf("out %d\n", (int)pict->data[0]);
#if 0 //?

    /* Return the Picture timestamp as the frame number */
    /* we subtract 1 because it is added on utils.c     */
    avctx->frame_number = s->picture_number - 1;
#endif
    return get_consumed_bytes(s, buf_index, buf_size);
}
/* Disabled code: everything up to the matching #endif is compiled out. */
#if 0
/**
 * Fill h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y).
 *
 * Entries 0..2 are set when the macroblocks at mb_xy - mb_stride - 1,
 * mb_xy - mb_stride and mb_xy - mb_stride + 1 carry the current slice_num in
 * h->slice_table (all three are 0 on row 0); entry 3 does the same for
 * mb_xy - 1. Entries 4 and 5 are constants.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;

    if(s->mb_y){
        /* mb_x bounds are checked so the -1/+1 neighbours stay inside the row */
        h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
    }else{
        h->mb_avail[0]=
        h->mb_avail[1]=
        h->mb_avail[2]= 0;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7713
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * buffer, reads them back and reports every value that does not round-trip.
 * The (I)DCT, quantizer and NAL layer tests further down are disabled.
 *
 * @return 0 (mismatches are only printed, they do not change the result)
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so they can be shown on a mismatch */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* values are centred on zero: -COUNT/2 .. COUNT/2-1 */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7888
7889
7890 static av_cold int decode_end(AVCodecContext *avctx)
7891 {
7892     H264Context *h = avctx->priv_data;
7893     MpegEncContext *s = &h->s;
7894     int i;
7895
7896     av_freep(&h->rbsp_buffer[0]);
7897     av_freep(&h->rbsp_buffer[1]);
7898     free_tables(h); //FIXME cleanup init stuff perhaps
7899
7900     for(i = 0; i < MAX_SPS_COUNT; i++)
7901         av_freep(h->sps_buffers + i);
7902
7903     for(i = 0; i < MAX_PPS_COUNT; i++)
7904         av_freep(h->pps_buffers + i);
7905
7906     MPV_common_end(s);
7907
7908 //    memset(h, 0, sizeof(H264Context));
7909
7910     return 0;
7911 }
7912
7913
/*
 * Codec descriptor for the H.264 decoder.
 * The leading entries are positional; the role noted next to each one is
 * inferred from the value and should be confirmed against the AVCodec
 * declaration in avcodec.h.
 */
AVCodec h264_decoder = {
    "h264",                 /* short codec name */
    CODEC_TYPE_VIDEO,
    CODEC_ID_H264,
    sizeof(H264Context),    /* size of the private context (avctx->priv_data) */
    decode_init,
    NULL,                   /* presumably the encode callback; this is a decoder only */
    decode_end,
    decode_frame,
    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
    .flush= flush_dpb,
    .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
};
7927
7928 #include "svq3.c"