libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  89 static const uint8_t rem6[52]={
  90 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
  91 };
  92
  93 static const uint8_t div6[52]={
  94 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
  95 };
  96
  97 static const int left_block_options[4][8]={
  98     {0,1,2,3,7,10,8,11},
  99     {2,2,3,3,8,11,8,11},
 100     {0,0,1,1,7,10,7,10},
 101     {0,2,0,2,7,10,7,10}
 102 };
 103
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values where changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
 252         if(IS_INTRA4x4(mb_type)){
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     *dst_length= di;
1434     *consumed= si + 1;//+1 for the header
1435 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1436     return dst;
1437 }
1438
1439 /**
1440  * identifies the exact end of the bitstream
1441  * @return the length of the trailing, or 0 if damaged
1442  */
1443 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1444     int v= *src;
1445     int r;
1446
1447     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1448
1449     for(r=1; r<9; r++){
1450         if(v&1) return r;
1451         v>>=1;
1452     }
1453     return 0;
1454 }
1455
1456 /**
1457  * IDCT transforms the 16 dc values and dequantizes them.
1458  * @param qp quantization parameter
1459  */
1460 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1461 #define stride 16
1462     int i;
1463     int temp[16]; //FIXME check if this is a good idea
1464     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1465     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1466
1467 //memset(block, 64, 2*256);
1468 //return;
1469     for(i=0; i<4; i++){
1470         const int offset= y_offset[i];
1471         const int z0= block[offset+stride*0] + block[offset+stride*4];
1472         const int z1= block[offset+stride*0] - block[offset+stride*4];
1473         const int z2= block[offset+stride*1] - block[offset+stride*5];
1474         const int z3= block[offset+stride*1] + block[offset+stride*5];
1475
1476         temp[4*i+0]= z0+z3;
1477         temp[4*i+1]= z1+z2;
1478         temp[4*i+2]= z1-z2;
1479         temp[4*i+3]= z0-z3;
1480     }
1481
1482     for(i=0; i<4; i++){
1483         const int offset= x_offset[i];
1484         const int z0= temp[4*0+i] + temp[4*2+i];
1485         const int z1= temp[4*0+i] - temp[4*2+i];
1486         const int z2= temp[4*1+i] - temp[4*3+i];
1487         const int z3= temp[4*1+i] + temp[4*3+i];
1488
1489         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1490         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1491         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1492         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1493     }
1494 }
1495
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; relies on the stride macro of the function above still
 * being defined. The result is written back to block in place and every
 * output is halved (>>1).
 * The qp parameter and the qmul lookup are commented out, so no
 * quantization is applied here. FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first butterfly pass: block -> temp */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second butterfly pass: temp -> block */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1535
1536 #undef xStride
1537 #undef stride
1538
1539 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1540     const int stride= 16*2;
1541     const int xStride= 16;
1542     int a,b,c,d,e;
1543
1544     a= block[stride*0 + xStride*0];
1545     b= block[stride*0 + xStride*1];
1546     c= block[stride*1 + xStride*0];
1547     d= block[stride*1 + xStride*1];
1548
1549     e= a-b;
1550     a= a+b;
1551     b= c-d;
1552     c= c+d;
1553
1554     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1555     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1556     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1557     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1558 }
1559
#if 0
/**
 * Transforms the 2x2 chroma dc values in place.
 * Disabled code; no scaling or quantization is applied to the outputs.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* sums and differences of each pair; e holds the first difference */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1582
1583 /**
1584  * gets the chroma qp.
1585  */
1586 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1587     return h->pps.chroma_qp_table[t][qscale];
1588 }
1589
/**
 * Motion compensates one partition from a single reference picture.
 * Luma is produced with qpix_op, then (unless gray-only decoding is
 * requested) both chroma planes with chroma_op.
 * @param pic           reference picture to predict from
 * @param n             block index whose vector is taken from mv_cache via scan8
 * @param square        if zero, a second luma call is made at offset delta
 * @param chroma_height height passed to chroma_op
 * @param delta         byte offset of the second luma block (used if !square)
 * @param list          reference list the motion vector is read from
 * @param src_x_offset  horizontal position of the partition; multiplied by 8
 *                      before it is added to the motion vector
 * @param src_y_offset  vertical counterpart of src_x_offset
 */
static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
                           uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                           int src_x_offset, int src_y_offset,
                           qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
    MpegEncContext * const s = &h->s;
    const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
    int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
    /* the two fractional bits of each component pick the luma function */
    const int luma_xy= (mx&3) + ((my&3)<<2);
    uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
    uint8_t * src_cb, * src_cr;
    int extra_width= h->emu_edge_width;
    int extra_height= h->emu_edge_height;
    int emu=0;
    const int full_mx= mx>>2;
    const int full_my= my>>2;
    const int pic_width  = 16*s->mb_width;
    const int pic_height = 16*s->mb_height >> MB_FIELD;

    /* a fractional position reduces the usable border by 3 samples */
    if(mx&7) extra_width -= 3;
    if(my&7) extra_height -= 3;

    /* source area outside of picture plus border: go through
     * ff_emulated_edge_mc and read luma from edge_emu_buffer instead */
    if(   full_mx < 0-extra_width
       || full_my < 0-extra_height
       || full_mx + 16/*FIXME*/ > pic_width + extra_width
       || full_my + 16/*FIXME*/ > pic_height + extra_height){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
            src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
        emu=1;
    }

    qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
    if(!square){
        /* non-square partition: second luma block at offset delta */
        qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
    }

    /* gray-only decoding skips chroma entirely */
    if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;

    if(MB_FIELD){
        // chroma offset when predicting from a field of opposite parity
        my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
        emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
    }
    src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
    src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;

    /* edge_emu_buffer is reused for cb and then cr, so each plane is
     * consumed by chroma_op before the next one is prepared */
    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cb= s->edge_emu_buffer;
    }
    chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);

    if(emu){
        ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
            src_cr= s->edge_emu_buffer;
    }
    chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
}
1647
1648 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1649                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1650                            int x_offset, int y_offset,
1651                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1652                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1653                            int list0, int list1){
1654     MpegEncContext * const s = &h->s;
1655     qpel_mc_func *qpix_op=  qpix_put;
1656     h264_chroma_mc_func chroma_op= chroma_put;
1657
1658     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1659     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1660     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1661     x_offset += 8*s->mb_x;
1662     y_offset += 8*(s->mb_y >> MB_FIELD);
1663
1664     if(list0){
1665         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1666         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1667                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1668                            qpix_op, chroma_op);
1669
1670         qpix_op=  qpix_avg;
1671         chroma_op= chroma_avg;
1672     }
1673
1674     if(list1){
1675         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1676         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1677                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1678                            qpix_op, chroma_op);
1679     }
1680 }
1681
1682 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1683                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1684                            int x_offset, int y_offset,
1685                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1686                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1687                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1688                            int list0, int list1){
1689     MpegEncContext * const s = &h->s;
1690
1691     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1692     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1693     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1694     x_offset += 8*s->mb_x;
1695     y_offset += 8*(s->mb_y >> MB_FIELD);
1696
1697     if(list0 && list1){
1698         /* don't optimize for luma-only case, since B-frames usually
1699          * use implicit weights => chroma too. */
1700         uint8_t *tmp_cb = s->obmc_scratchpad;
1701         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1702         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1703         int refn0 = h->ref_cache[0][ scan8[n] ];
1704         int refn1 = h->ref_cache[1][ scan8[n] ];
1705
1706         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1707                     dest_y, dest_cb, dest_cr,
1708                     x_offset, y_offset, qpix_put, chroma_put);
1709         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1710                     tmp_y, tmp_cb, tmp_cr,
1711                     x_offset, y_offset, qpix_put, chroma_put);
1712
1713         if(h->use_weight == 2){
1714             int weight0 = h->implicit_weight[refn0][refn1];
1715             int weight1 = 64 - weight0;
1716             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1717             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1718             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1719         }else{
1720             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1721                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1722                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1723             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1724                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1725                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1726             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1727                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1728                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1729         }
1730     }else{
1731         int list = list1 ? 1 : 0;
1732         int refn = h->ref_cache[list][ scan8[n] ];
1733         Picture *ref= &h->ref_list[list][refn];
1734         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1735                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1736                     qpix_put, chroma_put);
1737
1738         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1739                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1740         if(h->use_weight_chroma){
1741             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1742                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1743             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1745         }
1746     }
1747 }
1748
1749 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1750                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1751                            int x_offset, int y_offset,
1752                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1753                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1754                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1755                            int list0, int list1){
1756     if((h->use_weight==2 && list0 && list1
1757         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1758        || h->use_weight==1)
1759         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1760                          x_offset, y_offset, qpix_put, chroma_put,
1761                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1762     else
1763         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1764                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1765 }
1766
1767 static inline void prefetch_motion(H264Context *h, int list){
1768     /* fetch pixels for estimated mv 4 macroblocks ahead
1769      * optimized for 64byte cache lines */
1770     MpegEncContext * const s = &h->s;
1771     const int refn = h->ref_cache[list][scan8[0]];
1772     if(refn >= 0){
1773         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1774         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1775         uint8_t **src= h->ref_list[list][refn].data;
1776         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1777         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1778         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1779         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1780     }
1781 }
1782
/**
 * Motion compensates the current inter macroblock.
 * The macroblock type (and for 8x8 the sub macroblock types) decide how the
 * macroblock is split; mc_part() is called once per partition with the
 * function table entries that belong to the partition size.
 * @param qpix_put   luma put functions, indexed by size and then by fractional position
 * @param chroma_put chroma put functions, indexed by size
 * @param qpix_avg   luma avg functions, same layout as qpix_put
 * @param chroma_avg chroma avg functions, same layout as chroma_put
 * @param weight_op  weight functions, indexed by partition shape
 * @param weight_avg biweight functions, indexed by partition shape
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one square partition */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two partitions, the second one at chroma y offset 4 */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two partitions, the second one at chroma x offset 4 */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 blocks, each split according to its own sub type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside the 8x8 block */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1869
1870 static av_cold void decode_init_vlc(void){
1871     static int done = 0;
1872
1873     if (!done) {
1874         int i;
1875         int offset;
1876         done = 1;
1877
1878         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1879         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1880         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1881                  &chroma_dc_coeff_token_len [0], 1, 1,
1882                  &chroma_dc_coeff_token_bits[0], 1, 1,
1883                  INIT_VLC_USE_NEW_STATIC);
1884
1885         offset = 0;
1886         for(i=0; i<4; i++){
1887             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1888             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1889             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1890                      &coeff_token_len [i][0], 1, 1,
1891                      &coeff_token_bits[i][0], 1, 1,
1892                      INIT_VLC_USE_NEW_STATIC);
1893             offset += coeff_token_vlc_tables_size[i];
1894         }
1895         /*
1896          * This is a one time safety check to make sure that
1897          * the packed static coeff_token_vlc table sizes
1898          * were initialized correctly.
1899          */
1900         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1901
1902         for(i=0; i<3; i++){
1903             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1904             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1905             init_vlc(&chroma_dc_total_zeros_vlc[i],
1906                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1907                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1908                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1909                      INIT_VLC_USE_NEW_STATIC);
1910         }
1911         for(i=0; i<15; i++){
1912             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1913             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1914             init_vlc(&total_zeros_vlc[i],
1915                      TOTAL_ZEROS_VLC_BITS, 16,
1916                      &total_zeros_len [i][0], 1, 1,
1917                      &total_zeros_bits[i][0], 1, 1,
1918                      INIT_VLC_USE_NEW_STATIC);
1919         }
1920
1921         for(i=0; i<6; i++){
1922             run_vlc[i].table = run_vlc_tables[i];
1923             run_vlc[i].table_allocated = run_vlc_tables_size;
1924             init_vlc(&run_vlc[i],
1925                      RUN_VLC_BITS, 7,
1926                      &run_len [i][0], 1, 1,
1927                      &run_bits[i][0], 1, 1,
1928                      INIT_VLC_USE_NEW_STATIC);
1929         }
1930         run7_vlc.table = run7_vlc_table,
1931         run7_vlc.table_allocated = run7_vlc_table_size;
1932         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1933                  &run_len [6][0], 1, 1,
1934                  &run_bits[6][0], 1, 1,
1935                  INIT_VLC_USE_NEW_STATIC);
1936     }
1937 }
1938
1939 static void free_tables(H264Context *h){
1940     int i;
1941     H264Context *hx;
1942     av_freep(&h->intra4x4_pred_mode);
1943     av_freep(&h->chroma_pred_mode_table);
1944     av_freep(&h->cbp_table);
1945     av_freep(&h->mvd_table[0]);
1946     av_freep(&h->mvd_table[1]);
1947     av_freep(&h->direct_table);
1948     av_freep(&h->non_zero_count);
1949     av_freep(&h->slice_table_base);
1950     h->slice_table= NULL;
1951
1952     av_freep(&h->mb2b_xy);
1953     av_freep(&h->mb2b8_xy);
1954
1955     for(i = 0; i < h->s.avctx->thread_count; i++) {
1956         hx = h->thread_context[i];
1957         if(!hx) continue;
1958         av_freep(&hx->top_borders[1]);
1959         av_freep(&hx->top_borders[0]);
1960         av_freep(&hx->s.obmc_scratchpad);
1961     }
1962 }
1963
1964 static void init_dequant8_coeff_table(H264Context *h){
1965     int i,q,x;
1966     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1967     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1968     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1969
1970     for(i=0; i<2; i++ ){
1971         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1972             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1973             break;
1974         }
1975
1976         for(q=0; q<52; q++){
1977             int shift = div6[q];
1978             int idx = rem6[q];
1979             for(x=0; x<64; x++)
1980                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1981                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1982                     h->pps.scaling_matrix8[i][x]) << shift;
1983         }
1984     }
1985 }
1986
1987 static void init_dequant4_coeff_table(H264Context *h){
1988     int i,j,q,x;
1989     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1990     for(i=0; i<6; i++ ){
1991         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1992         for(j=0; j<i; j++){
1993             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1994                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1995                 break;
1996             }
1997         }
1998         if(j<i)
1999             continue;
2000
2001         for(q=0; q<52; q++){
2002             int shift = div6[q] + 2;
2003             int idx = rem6[q];
2004             for(x=0; x<16; x++)
2005                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2006                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2007                     h->pps.scaling_matrix4[i][x]) << shift;
2008         }
2009     }
2010 }
2011
2012 static void init_dequant_tables(H264Context *h){
2013     int i,x;
2014     init_dequant4_coeff_table(h);
2015     if(h->pps.transform_8x8_mode)
2016         init_dequant8_coeff_table(h);
2017     if(h->sps.transform_bypass){
2018         for(i=0; i<6; i++)
2019             for(x=0; x<16; x++)
2020                 h->dequant4_coeff[i][0][x] = 1<<6;
2021         if(h->pps.transform_8x8_mode)
2022             for(i=0; i<2; i++)
2023                 for(x=0; x<64; x++)
2024                     h->dequant8_coeff[i][0][x] = 1<<6;
2025     }
2026 }
2027
2028
2029 /**
2030  * allocates tables.
2031  * needs width/height
2032  */
2033 static int alloc_tables(H264Context *h){
2034     MpegEncContext * const s = &h->s;
2035     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2036     int x,y;
2037
2038     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2039
2040     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2041     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2042     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2043
2044     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2045     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2046     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2047     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2048
2049     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2050     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2051
2052     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2053     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2054     for(y=0; y<s->mb_height; y++){
2055         for(x=0; x<s->mb_width; x++){
2056             const int mb_xy= x + y*s->mb_stride;
2057             const int b_xy = 4*x + 4*y*h->b_stride;
2058             const int b8_xy= 2*x + 2*y*h->b8_stride;
2059
2060             h->mb2b_xy [mb_xy]= b_xy;
2061             h->mb2b8_xy[mb_xy]= b8_xy;
2062         }
2063     }
2064
2065     s->obmc_scratchpad = NULL;
2066
2067     if(!h->dequant4_coeff[0])
2068         init_dequant_tables(h);
2069
2070     return 0;
2071 fail:
2072     free_tables(h);
2073     return -1;
2074 }
2075
2076 /**
2077  * Mimic alloc_tables(), but for every context thread.
2078  */
2079 static void clone_tables(H264Context *dst, H264Context *src){
2080     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2081     dst->non_zero_count           = src->non_zero_count;
2082     dst->slice_table              = src->slice_table;
2083     dst->cbp_table                = src->cbp_table;
2084     dst->mb2b_xy                  = src->mb2b_xy;
2085     dst->mb2b8_xy                 = src->mb2b8_xy;
2086     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2087     dst->mvd_table[0]             = src->mvd_table[0];
2088     dst->mvd_table[1]             = src->mvd_table[1];
2089     dst->direct_table             = src->direct_table;
2090
2091     dst->s.obmc_scratchpad = NULL;
2092     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2093 }
2094
2095 /**
2096  * Init context
2097  * Allocate buffers which are not shared amongst multiple threads.
2098  */
2099 static int context_init(H264Context *h){
2100     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2101     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2102
2103     return 0;
2104 fail:
2105     return -1; // free_tables will clean up for us
2106 }
2107
2108 static av_cold void common_init(H264Context *h){
2109     MpegEncContext * const s = &h->s;
2110
2111     s->width = s->avctx->width;
2112     s->height = s->avctx->height;
2113     s->codec_id= s->avctx->codec->id;
2114
2115     ff_h264_pred_init(&h->hpc, s->codec_id);
2116
2117     h->dequant_coeff_pps= -1;
2118     s->unrestricted_mv=1;
2119     s->decode=1; //FIXME
2120
2121     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2122     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2123 }
2124
2125 static av_cold int decode_init(AVCodecContext *avctx){
2126     H264Context *h= avctx->priv_data;
2127     MpegEncContext * const s = &h->s;
2128
2129     MPV_decode_defaults(s);
2130
2131     s->avctx = avctx;
2132     common_init(h);
2133
2134     s->out_format = FMT_H264;
2135     s->workaround_bugs= avctx->workaround_bugs;
2136
2137     // set defaults
2138 //    s->decode_mb= ff_h263_decode_mb;
2139     s->quarter_sample = 1;
2140     s->low_delay= 1;
2141
2142     if(avctx->codec_id == CODEC_ID_SVQ3)
2143         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2144     else
2145         avctx->pix_fmt= PIX_FMT_YUV420P;
2146
2147     decode_init_vlc();
2148
2149     if(avctx->extradata_size > 0 && avctx->extradata &&
2150        *(char *)avctx->extradata == 1){
2151         h->is_avc = 1;
2152         h->got_avcC = 0;
2153     } else {
2154         h->is_avc = 0;
2155     }
2156
2157     h->thread_context[0] = h;
2158     h->outputed_poc = INT_MIN;
2159     h->prev_poc_msb= 1<<16;
2160     return 0;
2161 }
2162
2163 static int frame_start(H264Context *h){
2164     MpegEncContext * const s = &h->s;
2165     int i;
2166
2167     if(MPV_frame_start(s, s->avctx) < 0)
2168         return -1;
2169     ff_er_frame_start(s);
2170     /*
2171      * MPV_frame_start uses pict_type to derive key_frame.
2172      * This is incorrect for H.264; IDR markings must be used.
2173      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2174      * See decode_nal_units().
2175      */
2176     s->current_picture_ptr->key_frame= 0;
2177
2178     assert(s->linesize && s->uvlinesize);
2179
2180     for(i=0; i<16; i++){
2181         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2182         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2183     }
2184     for(i=0; i<4; i++){
2185         h->block_offset[16+i]=
2186         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2187         h->block_offset[24+16+i]=
2188         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2189     }
2190
2191     /* can't be in alloc_tables because linesize isn't known there.
2192      * FIXME: redo bipred weight to not require extra buffer? */
2193     for(i = 0; i < s->avctx->thread_count; i++)
2194         if(!h->thread_context[i]->s.obmc_scratchpad)
2195             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2196
2197     /* some macroblocks will be accessed before they're available */
2198     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2199         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2200
2201 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2202
2203     // We mark the current picture as non-reference after allocating it, so
2204     // that if we break out due to an error it can be released automatically
2205     // in the next MPV_frame_start().
2206     // SVQ3 as well as most other codecs have only last/next/current and thus
2207     // get released even with set reference, besides SVQ3 and others do not
2208     // mark frames as reference later "naturally".
2209     if(s->codec_id != CODEC_ID_SVQ3)
2210         s->current_picture_ptr->reference= 0;
2211
2212     s->current_picture_ptr->field_poc[0]=
2213     s->current_picture_ptr->field_poc[1]= INT_MAX;
2214     assert(s->current_picture_ptr->long_ref==0);
2215
2216     return 0;
2217 }
2218
/**
 * Saves border samples of the current macroblock into h->left_border
 * (rightmost luma/chroma column) and h->top_borders (bottom luma/chroma row).
 * hl_decode_mb_internal() calls this right before the deblocking filter;
 * xchg_mb_border() later exchanges the saved samples with picture pixels.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line strides used for this macroblock
 * @param simple     non-zero skips the MBAFF handling and the gray check
 */
static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
    MpegEncContext * const s = &h->s;
    int i;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;
    int skiplast= 0;

    // move to the line above the macroblock, row r of the MB is now at (r+1)*linesize
    src_y  -=   linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
            if(!MB_MBAFF){
                // additionally store the second to last luma/chroma row in top_borders[0]
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
                *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
                if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
                    *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
                }
            }
        }else{
            if(!MB_MBAFF){
                h->left_border[0]= h->top_borders[0][s->mb_x][15];
                if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                    h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
                    h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
                }
                // the loops below then stop one row early
                skiplast= 1;
            }
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    // There are two lines saved, the line above the top macroblock of a pair,
    // and the line above the bottom macroblock
    // first entry: last luma sample of the previously saved top border
    h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
    // rightmost luma column (x=15) of the macroblock
    for(i=1; i<17 - skiplast; i++){
        h->left_border[offset+i*step]= src_y[15+i*  linesize];
    }

    // bottom luma row of the macroblock, 16 bytes copied as two 64 bit words
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
    *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        // same for chroma: cb is kept at left_border[34..], cr at left_border[34+18..]
        h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
        h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
        for(i=1; i<9 - skiplast; i++){
            h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
            h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
        }
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
        *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
    }
}
2281
/**
 * Exchanges the samples stored by backup_mb_border() with the picture pixels
 * directly left of and above the current macroblock.
 * hl_decode_mb_internal() calls it with xchg=1 before intra prediction and
 * with xchg=0 afterwards.
 *
 * @param src_y,src_cb,src_cr top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize line strides used for this macroblock
 * @param xchg   non-zero: really swap border store and picture;
 *               zero: only copy the border store into the picture
 *               (some exchanges below always pass 1, see the XCHG calls)
 * @param simple non-zero skips the MBAFF handling and the gray check
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left;
    int deblock_top;
    int mb_xy;
    int step    = 1;
    int offset  = 1;
    int uvoffset= 1;
    int top_idx = 1;

    // same left_border/top_borders addressing as in backup_mb_border()
    if(!simple && FRAME_MBAFF){
        if(s->mb_y&1){
            offset  = MB_MBAFF ? 1 : 17;
            uvoffset= MB_MBAFF ? 1 : 9;
        }else{
            offset  =
            uvoffset=
            top_idx = MB_MBAFF ? 0 : 1;
        }
        step= MB_MBAFF ? 2 : 1;
    }

    if(h->deblocking_filter == 2) {
        // only neighbours with the same slice_table entry are exchanged
        mb_xy = h->mb_xy;
        deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
        deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
    } else {
        deblock_left = (s->mb_x > 0);
        deblock_top =  (s->mb_y > !!MB_FIELD);
    }

    // point at the sample diagonally above-left of the macroblock
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

// a: border store, b: picture, t: temporary.
// b always receives the old a; a receives the old b only if xchg is set.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // row 0 is the top-left corner, it is skipped if there is no top neighbour
        for(i = !deblock_top; i<16; i++){
            XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
        }
        // i == 16 here: the last row is always fully swapped
        XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
    }

    if(deblock_top){
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        // first 8 samples of the next macroblock column (above-right of this MB)
        if(s->mb_x+1 < s->mb_width){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
        }
    }

    if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            for(i = !deblock_top; i<8; i++){
                XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
            }
            // i == 8 here
            XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
            XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
2355
/**
 * Reconstructs the current macroblock (h->mb_xy) into the current picture:
 * intra prediction or motion compensation, addition of the residual held in
 * h->mb, and, if h->deblocking_filter is set, the deblocking filter.
 *
 * @param simple callers pass a literal 0 or 1. With 1 the field/MBAFF, PCM
 *               and transform bypass paths are skipped, the gray flag is
 *               not checked and the codec is taken to be H.264 (no SVQ3).
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= h->mb_xy;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
    const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    // top-left sample of the macroblock in each plane (16x16 luma, 8x8 chroma)
    dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
    dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
    dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        // field macroblock: doubled strides and the second half of block_offset
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this function?
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            // rewrite the cached reference indices as (16+ref)^(mb_y&1)
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        // PCM: the samples are copied directly out of h->mb
        for (i=0; i<16; i++) {
            memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
        }
        for (i=0; i<8; i++) {
            memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
            memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
        }
    } else {
        if(IS_INTRA(mb_type)){
            // swap in the border samples saved by backup_mb_border() for prediction
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        // four 8x8 luma blocks: predict, then add the residual
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels8;
                        }else{
                            idct_dc_add = s->dsp.h264_idct8_dc_add;
                            idct_add    = s->dsp.h264_idct8_add;
                        }
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                const int nnz = h->non_zero_count_cache[ scan8[i] ];
                                h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                            (h->topright_samples_available<<i)&0x4000, linesize);
                                if(nnz){
                                    // a single non-zero coefficient that is the DC uses the DC-only add
                                    if(nnz == 1 && h->mb[i*16])
                                        idct_dc_add(ptr, h->mb + i*16, linesize);
                                    else
                                        idct_add   (ptr, h->mb + i*16, linesize);
                                }
                            }
                        }
                    }else{
                        // sixteen 4x4 luma blocks
                        if(transform_bypass){
                            idct_dc_add =
                            idct_add    = s->dsp.add_pixels4;
                        }else{
                            idct_dc_add = s->dsp.h264_idct_dc_add;
                            idct_add    = s->dsp.h264_idct_add;
                        }
                        for(i=0; i<16; i++){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];

                            if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
                            }else{
                                uint8_t *topright;
                                int nnz, tr;
                                if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                                    const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                                    assert(mb_y || linesize <= block_offset[i]);
                                    if(!topright_avail){
                                        // no top-right samples: replicate the last top sample 4 times
                                        tr= ptr[3 - linesize]*0x01010101;
                                        topright= (uint8_t*) &tr;
                                    }else
                                        topright= ptr + 4 - linesize;
                                }else
                                    topright= NULL;

                                h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                                nnz = h->non_zero_count_cache[ scan8[i] ];
                                if(nnz){
                                    if(is_h264){
                                        if(nnz == 1 && h->mb[i*16])
                                            idct_dc_add(ptr, h->mb + i*16, linesize);
                                        else
                                            idct_add   (ptr, h->mb + i*16, linesize);
                                    }else
                                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                                }
                            }
                        }
                    }
                }
            }else{
                // intra 16x16: predict the whole block, dequantize the luma DC coefficients
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            // copy the border store back into the picture (xchg=0)
            if(h->deblocking_filter)
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            // inter macroblock: motion compensation
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        // luma residual for everything but intra 4x4 (handled per block above)
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    if(transform_bypass){
                        if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
                            h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
                        }else{
                            for(i=0; i<16; i++){
                                if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                                    s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                         s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                    }
                }else if(h->cbp&15){
                    if(transform_bypass){
                        const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                        idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
                        for(i=0; i<16; i+=di){
                            if(h->non_zero_count_cache[ scan8[i] ]){
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            }
                        }
                    }else{
                        if(IS_8x8DCT(mb_type)){
                            s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }else{
                            s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        // chroma residual: blocks 16..19 go to cb, 20..23 to cr (selected by (i&4)>>2)
        if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
                    h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
                }else{
                    idct_add = s->dsp.add_pixels4;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }
            }else{
                // dequant4_coeff lists 1/2 are used for intra cb/cr, 4/5 for inter cb/cr
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
                if(is_h264){
                    idct_add = s->dsp.h264_idct_add;
                    idct_dc_add = s->dsp.h264_idct_dc_add;
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    }
                }else{
                    for(i=16; i<16+8; i++){
                        if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                            uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                            svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                        }
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        // save the border samples first, then filter this macroblock
        backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
        fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
        h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
        h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
        if (!simple && FRAME_MBAFF) {
            filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2606
/**
 * Process a macroblock; this case avoids checks for expensive uncommon cases.
 * It instantiates hl_decode_mb_internal() with simple=1; hl_decode_mb()
 * selects it when the macroblock is not flagged as complex.
 */
static void hl_decode_mb_simple(H264Context *h){
    hl_decode_mb_internal(h, 1);
}
2613
/**
 * Process a macroblock; this handles edge cases, such as interlacing.
 * It instantiates hl_decode_mb_internal() with simple=0 and is marked
 * av_noinline.
 */
static void av_noinline hl_decode_mb_complex(H264Context *h){
    hl_decode_mb_internal(h, 0);
}
2620
2621 static void hl_decode_mb(H264Context *h){
2622     MpegEncContext * const s = &h->s;
2623     const int mb_xy= h->mb_xy;
2624     const int mb_type= s->current_picture.mb_type[mb_xy];
2625     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2626
2627     if(ENABLE_H264_ENCODER && !s->decode)
2628         return;
2629
2630     if (is_complex)
2631         hl_decode_mb_complex(h);
2632     else hl_decode_mb_simple(h);
2633 }
2634
2635 static void pic_as_field(Picture *pic, const int parity){
2636     int i;
2637     for (i = 0; i < 4; ++i) {
2638         if (parity == PICT_BOTTOM_FIELD)
2639             pic->data[i] += pic->linesize[i];
2640         pic->reference = parity;
2641         pic->linesize[i] *= 2;
2642     }
2643     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2644 }
2645
2646 static int split_field_copy(Picture *dest, Picture *src,
2647                             int parity, int id_add){
2648     int match = !!(src->reference & parity);
2649
2650     if (match) {
2651         *dest = *src;
2652         if(parity != PICT_FRAME){
2653             pic_as_field(dest, parity);
2654             dest->pic_id *= 2;
2655             dest->pic_id += id_add;
2656         }
2657     }
2658
2659     return match;
2660 }
2661
2662 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2663     int i[2]={0};
2664     int index=0;
2665
2666     while(i[0]<len || i[1]<len){
2667         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2668             i[0]++;
2669         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2670             i[1]++;
2671         if(i[0] < len){
2672             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2673             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2674         }
2675         if(i[1] < len){
2676             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2677             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2678         }
2679     }
2680
2681     return index;
2682 }
2683
2684 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2685     int i, best_poc;
2686     int out_i= 0;
2687
2688     for(;;){
2689         best_poc= dir ? INT_MIN : INT_MAX;
2690
2691         for(i=0; i<len; i++){
2692             const int poc= src[i]->poc;
2693             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2694                 best_poc= poc;
2695                 sorted[out_i]= src[i];
2696             }
2697         }
2698         if(best_poc == (dir ? INT_MIN : INT_MAX))
2699             break;
2700         limit= sorted[out_i++]->poc - dir;
2701     }
2702     return out_i;
2703 }
2704
2705 /**
2706  * fills the default_ref_list.
2707  */
2708 static int fill_default_ref_list(H264Context *h){
2709     MpegEncContext * const s = &h->s;
2710     int i, len;
2711
2712     if(h->slice_type_nos==FF_B_TYPE){
2713         Picture *sorted[32];
2714         int cur_poc, list;
2715         int lens[2];
2716
2717         if(FIELD_PICTURE)
2718             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2719         else
2720             cur_poc= s->current_picture_ptr->poc;
2721
2722         for(list= 0; list<2; list++){
2723             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2724             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2725             assert(len<=32);
2726             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2727             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2728             assert(len<=32);
2729
2730             if(len < h->ref_count[list])
2731                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2732             lens[list]= len;
2733         }
2734
2735         if(lens[0] == lens[1] && lens[1] > 1){
2736             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2737             if(i == lens[0])
2738                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2739         }
2740     }else{
2741         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2742         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2743         assert(len <= 32);
2744         if(len < h->ref_count[0])
2745             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2746     }
2747 #ifdef TRACE
2748     for (i=0; i<h->ref_count[0]; i++) {
2749         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2750     }
2751     if(h->slice_type_nos==FF_B_TYPE){
2752         for (i=0; i<h->ref_count[1]; i++) {
2753             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2754         }
2755     }
2756 #endif
2757     return 0;
2758 }
2759
2760 static void print_short_term(H264Context *h);
2761 static void print_long_term(H264Context *h);
2762
2763 /**
2764  * Extract structure information about the picture described by pic_num in
2765  * the current decoding context (frame or field). Note that pic_num is
2766  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2767  * @param pic_num picture number for which to extract structure information
2768  * @param structure one of PICT_XXX describing structure of picture
2769  *                      with pic_num
2770  * @return frame number (short term) or long term index of picture
2771  *         described by pic_num
2772  */
2773 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2774     MpegEncContext * const s = &h->s;
2775
2776     *structure = s->picture_structure;
2777     if(FIELD_PICTURE){
2778         if (!(pic_num & 1))
2779             /* opposite field */
2780             *structure ^= PICT_FRAME;
2781         pic_num >>= 1;
2782     }
2783
2784     return pic_num;
2785 }
2786
2787 static int decode_ref_pic_list_reordering(H264Context *h){
2788     MpegEncContext * const s = &h->s;
2789     int list, index, pic_structure;
2790
2791     print_short_term(h);
2792     print_long_term(h);
2793
2794     for(list=0; list<h->list_count; list++){
2795         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2796
2797         if(get_bits1(&s->gb)){
2798             int pred= h->curr_pic_num;
2799
2800             for(index=0; ; index++){
2801                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2802                 unsigned int pic_id;
2803                 int i;
2804                 Picture *ref = NULL;
2805
2806                 if(reordering_of_pic_nums_idc==3)
2807                     break;
2808
2809                 if(index >= h->ref_count[list]){
2810                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2811                     return -1;
2812                 }
2813
2814                 if(reordering_of_pic_nums_idc<3){
2815                     if(reordering_of_pic_nums_idc<2){
2816                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2817                         int frame_num;
2818
2819                         if(abs_diff_pic_num > h->max_pic_num){
2820                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2821                             return -1;
2822                         }
2823
2824                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2825                         else                                pred+= abs_diff_pic_num;
2826                         pred &= h->max_pic_num - 1;
2827
2828                         frame_num = pic_num_extract(h, pred, &pic_structure);
2829
2830                         for(i= h->short_ref_count-1; i>=0; i--){
2831                             ref = h->short_ref[i];
2832                             assert(ref->reference);
2833                             assert(!ref->long_ref);
2834                             if(
2835                                    ref->frame_num == frame_num &&
2836                                    (ref->reference & pic_structure)
2837                               )
2838                                 break;
2839                         }
2840                         if(i>=0)
2841                             ref->pic_id= pred;
2842                     }else{
2843                         int long_idx;
2844                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2845
2846                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2847
2848                         if(long_idx>31){
2849                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2850                             return -1;
2851                         }
2852                         ref = h->long_ref[long_idx];
2853                         assert(!(ref && !ref->reference));
2854                         if(ref && (ref->reference & pic_structure)){
2855                             ref->pic_id= pic_id;
2856                             assert(ref->long_ref);
2857                             i=0;
2858                         }else{
2859                             i=-1;
2860                         }
2861                     }
2862
2863                     if (i < 0) {
2864                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2865                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2866                     } else {
2867                         for(i=index; i+1<h->ref_count[list]; i++){
2868                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2869                                 break;
2870                         }
2871                         for(; i > index; i--){
2872                             h->ref_list[list][i]= h->ref_list[list][i-1];
2873                         }
2874                         h->ref_list[list][index]= *ref;
2875                         if (FIELD_PICTURE){
2876                             pic_as_field(&h->ref_list[list][index], pic_structure);
2877                         }
2878                     }
2879                 }else{
2880                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2881                     return -1;
2882                 }
2883             }
2884         }
2885     }
2886     for(list=0; list<h->list_count; list++){
2887         for(index= 0; index < h->ref_count[list]; index++){
2888             if(!h->ref_list[list][index].data[0]){
2889                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2890                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2891             }
2892         }
2893     }
2894
2895     return 0;
2896 }
2897
2898 static void fill_mbaff_ref_list(H264Context *h){
2899     int list, i, j;
2900     for(list=0; list<2; list++){ //FIXME try list_count
2901         for(i=0; i<h->ref_count[list]; i++){
2902             Picture *frame = &h->ref_list[list][i];
2903             Picture *field = &h->ref_list[list][16+2*i];
2904             field[0] = *frame;
2905             for(j=0; j<3; j++)
2906                 field[0].linesize[j] <<= 1;
2907             field[0].reference = PICT_TOP_FIELD;
2908             field[0].poc= field[0].field_poc[0];
2909             field[1] = field[0];
2910             for(j=0; j<3; j++)
2911                 field[1].data[j] += frame->linesize[j];
2912             field[1].reference = PICT_BOTTOM_FIELD;
2913             field[1].poc= field[1].field_poc[1];
2914
2915             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2916             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2917             for(j=0; j<2; j++){
2918                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2919                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2920             }
2921         }
2922     }
2923     for(j=0; j<h->ref_count[1]; j++){
2924         for(i=0; i<h->ref_count[0]; i++)
2925             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2926         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2927         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2928     }
2929 }
2930
2931 static int pred_weight_table(H264Context *h){
2932     MpegEncContext * const s = &h->s;
2933     int list, i;
2934     int luma_def, chroma_def;
2935
2936     h->use_weight= 0;
2937     h->use_weight_chroma= 0;
2938     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2939     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2940     luma_def = 1<<h->luma_log2_weight_denom;
2941     chroma_def = 1<<h->chroma_log2_weight_denom;
2942
2943     for(list=0; list<2; list++){
2944         for(i=0; i<h->ref_count[list]; i++){
2945             int luma_weight_flag, chroma_weight_flag;
2946
2947             luma_weight_flag= get_bits1(&s->gb);
2948             if(luma_weight_flag){
2949                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2950                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2951                 if(   h->luma_weight[list][i] != luma_def
2952                    || h->luma_offset[list][i] != 0)
2953                     h->use_weight= 1;
2954             }else{
2955                 h->luma_weight[list][i]= luma_def;
2956                 h->luma_offset[list][i]= 0;
2957             }
2958
2959             if(CHROMA){
2960                 chroma_weight_flag= get_bits1(&s->gb);
2961                 if(chroma_weight_flag){
2962                     int j;
2963                     for(j=0; j<2; j++){
2964                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2965                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2966                         if(   h->chroma_weight[list][i][j] != chroma_def
2967                         || h->chroma_offset[list][i][j] != 0)
2968                             h->use_weight_chroma= 1;
2969                     }
2970                 }else{
2971                     int j;
2972                     for(j=0; j<2; j++){
2973                         h->chroma_weight[list][i][j]= chroma_def;
2974                         h->chroma_offset[list][i][j]= 0;
2975                     }
2976                 }
2977             }
2978         }
2979         if(h->slice_type_nos != FF_B_TYPE) break;
2980     }
2981     h->use_weight= h->use_weight || h->use_weight_chroma;
2982     return 0;
2983 }
2984
2985 static void implicit_weight_table(H264Context *h){
2986     MpegEncContext * const s = &h->s;
2987     int ref0, ref1;
2988     int cur_poc = s->current_picture_ptr->poc;
2989
2990     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2991        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2992         h->use_weight= 0;
2993         h->use_weight_chroma= 0;
2994         return;
2995     }
2996
2997     h->use_weight= 2;
2998     h->use_weight_chroma= 2;
2999     h->luma_log2_weight_denom= 5;
3000     h->chroma_log2_weight_denom= 5;
3001
3002     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3003         int poc0 = h->ref_list[0][ref0].poc;
3004         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3005             int poc1 = h->ref_list[1][ref1].poc;
3006             int td = av_clip(poc1 - poc0, -128, 127);
3007             if(td){
3008                 int tb = av_clip(cur_poc - poc0, -128, 127);
3009                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3010                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3011                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3012                     h->implicit_weight[ref0][ref1] = 32;
3013                 else
3014                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3015             }else
3016                 h->implicit_weight[ref0][ref1] = 32;
3017         }
3018     }
3019 }
3020
3021 /**
3022  * Mark a picture as no longer needed for reference. The refmask
3023  * argument allows unreferencing of individual fields or the whole frame.
3024  * If the picture becomes entirely unreferenced, but is being held for
3025  * display purposes, it is marked as such.
3026  * @param refmask mask of fields to unreference; the mask is bitwise
3027  *                anded with the reference marking of pic
3028  * @return non-zero if pic becomes entirely unreferenced (except possibly
3029  *         for display purposes) zero if one of the fields remains in
3030  *         reference
3031  */
3032 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3033     int i;
3034     if (pic->reference &= refmask) {
3035         return 0;
3036     } else {
3037         for(i = 0; h->delayed_pic[i]; i++)
3038             if(pic == h->delayed_pic[i]){
3039                 pic->reference=DELAYED_PIC_REF;
3040                 break;
3041             }
3042         return 1;
3043     }
3044 }
3045
3046 /**
3047  * instantaneous decoder refresh.
3048  */
3049 static void idr(H264Context *h){
3050     int i;
3051
3052     for(i=0; i<16; i++){
3053         remove_long(h, i, 0);
3054     }
3055     assert(h->long_ref_count==0);
3056
3057     for(i=0; i<h->short_ref_count; i++){
3058         unreference_pic(h, h->short_ref[i], 0);
3059         h->short_ref[i]= NULL;
3060     }
3061     h->short_ref_count=0;
3062     h->prev_frame_num= 0;
3063     h->prev_frame_num_offset= 0;
3064     h->prev_poc_msb=
3065     h->prev_poc_lsb= 0;
3066 }
3067
3068 /* forget old pics after a seek */
3069 static void flush_dpb(AVCodecContext *avctx){
3070     H264Context *h= avctx->priv_data;
3071     int i;
3072     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3073         if(h->delayed_pic[i])
3074             h->delayed_pic[i]->reference= 0;
3075         h->delayed_pic[i]= NULL;
3076     }
3077     h->outputed_poc= INT_MIN;
3078     idr(h);
3079     if(h->s.current_picture_ptr)
3080         h->s.current_picture_ptr->reference= 0;
3081     h->s.first_field= 0;
3082     ff_mpeg_flush(avctx);
3083 }
3084
3085 /**
3086  * Find a Picture in the short term reference list by frame number.
3087  * @param frame_num frame number to search for
3088  * @param idx the index into h->short_ref where returned picture is found
3089  *            undefined if no picture found.
3090  * @return pointer to the found picture, or NULL if no pic with the provided
3091  *                 frame number is found
3092  */
3093 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3094     MpegEncContext * const s = &h->s;
3095     int i;
3096
3097     for(i=0; i<h->short_ref_count; i++){
3098         Picture *pic= h->short_ref[i];
3099         if(s->avctx->debug&FF_DEBUG_MMCO)
3100             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3101         if(pic->frame_num == frame_num) {
3102             *idx = i;
3103             return pic;
3104         }
3105     }
3106     return NULL;
3107 }
3108
3109 /**
3110  * Remove a picture from the short term reference list by its index in
3111  * that list.  This does no checking on the provided index; it is assumed
3112  * to be valid. Other list entries are shifted down.
3113  * @param i index into h->short_ref of picture to remove.
3114  */
3115 static void remove_short_at_index(H264Context *h, int i){
3116     assert(i >= 0 && i < h->short_ref_count);
3117     h->short_ref[i]= NULL;
3118     if (--h->short_ref_count)
3119         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3120 }
3121
3122 /**
3123  *
3124  * @return the removed picture or NULL if an error occurs
3125  */
3126 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3127     MpegEncContext * const s = &h->s;
3128     Picture *pic;
3129     int i;
3130
3131     if(s->avctx->debug&FF_DEBUG_MMCO)
3132         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3133
3134     pic = find_short(h, frame_num, &i);
3135     if (pic){
3136         if(unreference_pic(h, pic, ref_mask))
3137         remove_short_at_index(h, i);
3138     }
3139
3140     return pic;
3141 }
3142
3143 /**
3144  * Remove a picture from the long term reference list by its index in
3145  * that list.
3146  * @return the removed picture or NULL if an error occurs
3147  */
3148 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3149     Picture *pic;
3150
3151     pic= h->long_ref[i];
3152     if (pic){
3153         if(unreference_pic(h, pic, ref_mask)){
3154             assert(h->long_ref[i]->long_ref == 1);
3155             h->long_ref[i]->long_ref= 0;
3156             h->long_ref[i]= NULL;
3157             h->long_ref_count--;
3158         }
3159     }
3160
3161     return pic;
3162 }
3163
3164 /**
3165  * print short term list
3166  */
3167 static void print_short_term(H264Context *h) {
3168     uint32_t i;
3169     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3170         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3171         for(i=0; i<h->short_ref_count; i++){
3172             Picture *pic= h->short_ref[i];
3173             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3174         }
3175     }
3176 }
3177
3178 /**
3179  * print long term list
3180  */
3181 static void print_long_term(H264Context *h) {
3182     uint32_t i;
3183     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3184         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3185         for(i = 0; i < 16; i++){
3186             Picture *pic= h->long_ref[i];
3187             if (pic) {
3188                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3189             }
3190         }
3191     }
3192 }
3193
3194 /**
3195  * Executes the reference picture marking (memory management control operations).
3196  */
3197 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3198     MpegEncContext * const s = &h->s;
3199     int i, j;
3200     int current_ref_assigned=0;
3201     Picture *pic;
3202
3203     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3204         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3205
3206     for(i=0; i<mmco_count; i++){
3207         int structure, frame_num;
3208         if(s->avctx->debug&FF_DEBUG_MMCO)
3209             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3210
3211         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3212            || mmco[i].opcode == MMCO_SHORT2LONG){
3213             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3214             pic = find_short(h, frame_num, &j);
3215             if(!pic){
3216                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3217                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3218                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3219                 continue;
3220             }
3221         }
3222
3223         switch(mmco[i].opcode){
3224         case MMCO_SHORT2UNUSED:
3225             if(s->avctx->debug&FF_DEBUG_MMCO)
3226                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3227             remove_short(h, frame_num, structure ^ PICT_FRAME);
3228             break;
3229         case MMCO_SHORT2LONG:
3230                 if (h->long_ref[mmco[i].long_arg] != pic)
3231                     remove_long(h, mmco[i].long_arg, 0);
3232
3233                 remove_short_at_index(h, j);
3234                 h->long_ref[ mmco[i].long_arg ]= pic;
3235                 if (h->long_ref[ mmco[i].long_arg ]){
3236                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3237                     h->long_ref_count++;
3238                 }
3239             break;
3240         case MMCO_LONG2UNUSED:
3241             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3242             pic = h->long_ref[j];
3243             if (pic) {
3244                 remove_long(h, j, structure ^ PICT_FRAME);
3245             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3246                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3247             break;
3248         case MMCO_LONG:
3249                     // Comment below left from previous code as it is an interresting note.
3250                     /* First field in pair is in short term list or
3251                      * at a different long term index.
3252                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3253                      * Report the problem and keep the pair where it is,
3254                      * and mark this field valid.
3255                      */
3256
3257             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3258                 remove_long(h, mmco[i].long_arg, 0);
3259
3260                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3261                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3262                 h->long_ref_count++;
3263             }
3264
3265             s->current_picture_ptr->reference |= s->picture_structure;
3266             current_ref_assigned=1;
3267             break;
3268         case MMCO_SET_MAX_LONG:
3269             assert(mmco[i].long_arg <= 16);
3270             // just remove the long term which index is greater than new max
3271             for(j = mmco[i].long_arg; j<16; j++){
3272                 remove_long(h, j, 0);
3273             }
3274             break;
3275         case MMCO_RESET:
3276             while(h->short_ref_count){
3277                 remove_short(h, h->short_ref[0]->frame_num, 0);
3278             }
3279             for(j = 0; j < 16; j++) {
3280                 remove_long(h, j, 0);
3281             }
3282             s->current_picture_ptr->poc=
3283             s->current_picture_ptr->field_poc[0]=
3284             s->current_picture_ptr->field_poc[1]=
3285             h->poc_lsb=
3286             h->poc_msb=
3287             h->frame_num=
3288             s->current_picture_ptr->frame_num= 0;
3289             break;
3290         default: assert(0);
3291         }
3292     }
3293
3294     if (!current_ref_assigned) {
3295         /* Second field of complementary field pair; the first field of
3296          * which is already referenced. If short referenced, it
3297          * should be first entry in short_ref. If not, it must exist
3298          * in long_ref; trying to put it on the short list here is an
3299          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3300          */
3301         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3302             /* Just mark the second field valid */
3303             s->current_picture_ptr->reference = PICT_FRAME;
3304         } else if (s->current_picture_ptr->long_ref) {
3305             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3306                                              "assignment for second field "
3307                                              "in complementary field pair "
3308                                              "(first field is long term)\n");
3309         } else {
3310             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3311             if(pic){
3312                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3313             }
3314
3315             if(h->short_ref_count)
3316                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3317
3318             h->short_ref[0]= s->current_picture_ptr;
3319             h->short_ref_count++;
3320             s->current_picture_ptr->reference |= s->picture_structure;
3321         }
3322     }
3323
3324     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3325
3326         /* We have too many reference frames, probably due to corrupted
3327          * stream. Need to discard one frame. Prevents overrun of the
3328          * short_ref and long_ref buffers.
3329          */
3330         av_log(h->s.avctx, AV_LOG_ERROR,
3331                "number of reference frames exceeds max (probably "
3332                "corrupt input), discarding one\n");
3333
3334         if (h->long_ref_count && !h->short_ref_count) {
3335             for (i = 0; i < 16; ++i)
3336                 if (h->long_ref[i])
3337                     break;
3338
3339             assert(i < 16);
3340             remove_long(h, i, 0);
3341         } else {
3342             pic = h->short_ref[h->short_ref_count - 1];
3343             remove_short(h, pic->frame_num, 0);
3344         }
3345     }
3346
3347     print_short_term(h);
3348     print_long_term(h);
3349     return 0;
3350 }
3351
3352 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3353     MpegEncContext * const s = &h->s;
3354     int i;
3355
3356     h->mmco_index= 0;
3357     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3358         s->broken_link= get_bits1(gb) -1;
3359         if(get_bits1(gb)){
3360             h->mmco[0].opcode= MMCO_LONG;
3361             h->mmco[0].long_arg= 0;
3362             h->mmco_index= 1;
3363         }
3364     }else{
3365         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3366             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3367                 MMCOOpcode opcode= get_ue_golomb(gb);
3368
3369                 h->mmco[i].opcode= opcode;
3370                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3371                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3372 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3373                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3374                         return -1;
3375                     }*/
3376                 }
3377                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3378                     unsigned int long_arg= get_ue_golomb(gb);
3379                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3380                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3381                         return -1;
3382                     }
3383                     h->mmco[i].long_arg= long_arg;
3384                 }
3385
3386                 if(opcode > (unsigned)MMCO_LONG){
3387                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3388                     return -1;
3389                 }
3390                 if(opcode == MMCO_END)
3391                     break;
3392             }
3393             h->mmco_index= i;
3394         }else{
3395             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3396
3397             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3398                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3399                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3400                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3401                 h->mmco_index= 1;
3402                 if (FIELD_PICTURE) {
3403                     h->mmco[0].short_pic_num *= 2;
3404                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3405                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3406                     h->mmco_index= 2;
3407                 }
3408             }
3409         }
3410     }
3411
3412     return 0;
3413 }
3414
3415 static int init_poc(H264Context *h){
3416     MpegEncContext * const s = &h->s;
3417     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3418     int field_poc[2];
3419     Picture *cur = s->current_picture_ptr;
3420
3421     h->frame_num_offset= h->prev_frame_num_offset;
3422     if(h->frame_num < h->prev_frame_num)
3423         h->frame_num_offset += max_frame_num;
3424
3425     if(h->sps.poc_type==0){
3426         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3427
3428         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3429             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3430         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3431             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3432         else
3433             h->poc_msb = h->prev_poc_msb;
3434 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3435         field_poc[0] =
3436         field_poc[1] = h->poc_msb + h->poc_lsb;
3437         if(s->picture_structure == PICT_FRAME)
3438             field_poc[1] += h->delta_poc_bottom;
3439     }else if(h->sps.poc_type==1){
3440         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3441         int i;
3442
3443         if(h->sps.poc_cycle_length != 0)
3444             abs_frame_num = h->frame_num_offset + h->frame_num;
3445         else
3446             abs_frame_num = 0;
3447
3448         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3449             abs_frame_num--;
3450
3451         expected_delta_per_poc_cycle = 0;
3452         for(i=0; i < h->sps.poc_cycle_length; i++)
3453             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3454
3455         if(abs_frame_num > 0){
3456             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3457             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3458
3459             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3460             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3461                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3462         } else
3463             expectedpoc = 0;
3464
3465         if(h->nal_ref_idc == 0)
3466             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3467
3468         field_poc[0] = expectedpoc + h->delta_poc[0];
3469         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3470
3471         if(s->picture_structure == PICT_FRAME)
3472             field_poc[1] += h->delta_poc[1];
3473     }else{
3474         int poc= 2*(h->frame_num_offset + h->frame_num);
3475
3476         if(!h->nal_ref_idc)
3477             poc--;
3478
3479         field_poc[0]= poc;
3480         field_poc[1]= poc;
3481     }
3482
3483     if(s->picture_structure != PICT_BOTTOM_FIELD)
3484         s->current_picture_ptr->field_poc[0]= field_poc[0];
3485     if(s->picture_structure != PICT_TOP_FIELD)
3486         s->current_picture_ptr->field_poc[1]= field_poc[1];
3487     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3488
3489     return 0;
3490 }
3491
3492
3493 /**
3494  * initialize scan tables
3495  */
3496 static void init_scan_tables(H264Context *h){
3497     MpegEncContext * const s = &h->s;
3498     int i;
3499     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3500         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3501         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3502     }else{
3503         for(i=0; i<16; i++){
3504 #define T(x) (x>>2) | ((x<<2) & 0xF)
3505             h->zigzag_scan[i] = T(zigzag_scan[i]);
3506             h-> field_scan[i] = T( field_scan[i]);
3507 #undef T
3508         }
3509     }
3510     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3511         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3512         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3513         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3514         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3515     }else{
3516         for(i=0; i<64; i++){
3517 #define T(x) (x>>3) | ((x&7)<<3)
3518             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3519             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3520             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3521             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3522 #undef T
3523         }
3524     }
3525     if(h->sps.transform_bypass){ //FIXME same ugly
3526         h->zigzag_scan_q0          = zigzag_scan;
3527         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3528         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3529         h->field_scan_q0           = field_scan;
3530         h->field_scan8x8_q0        = field_scan8x8;
3531         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3532     }else{
3533         h->zigzag_scan_q0          = h->zigzag_scan;
3534         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3535         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3536         h->field_scan_q0           = h->field_scan;
3537         h->field_scan8x8_q0        = h->field_scan8x8;
3538         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3539     }
3540 }
3541
3542 /**
3543  * Replicates H264 "master" context to thread contexts.
3544  */
3545 static void clone_slice(H264Context *dst, H264Context *src)
3546 {
3547     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3548     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3549     dst->s.current_picture      = src->s.current_picture;
3550     dst->s.linesize             = src->s.linesize;
3551     dst->s.uvlinesize           = src->s.uvlinesize;
3552     dst->s.first_field          = src->s.first_field;
3553
3554     dst->prev_poc_msb           = src->prev_poc_msb;
3555     dst->prev_poc_lsb           = src->prev_poc_lsb;
3556     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3557     dst->prev_frame_num         = src->prev_frame_num;
3558     dst->short_ref_count        = src->short_ref_count;
3559
3560     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3561     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3562     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3563     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3564
3565     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3566     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3567 }
3568
3569 /**
3570  * decodes a slice header.
3571  * This will also call MPV_common_init() and frame_start() as needed.
3572  *
3573  * @param h h264context
3574  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3575  *
3576  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3577  */
3578 static int decode_slice_header(H264Context *h, H264Context *h0){
3579     MpegEncContext * const s = &h->s;
3580     MpegEncContext * const s0 = &h0->s;
3581     unsigned int first_mb_in_slice;
3582     unsigned int pps_id;
3583     int num_ref_idx_active_override_flag;
3584     unsigned int slice_type, tmp, i, j;
3585     int default_ref_list_done = 0;
3586     int last_pic_structure;
3587
3588     s->dropable= h->nal_ref_idc == 0;
3589
3590     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3591         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3592         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3593     }else{
3594         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3595         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3596     }
3597
3598     first_mb_in_slice= get_ue_golomb(&s->gb);
3599
3600     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3601         h0->current_slice = 0;
3602         if (!s0->first_field)
3603             s->current_picture_ptr= NULL;
3604     }
3605
3606     slice_type= get_ue_golomb(&s->gb);
3607     if(slice_type > 9){
3608         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3609         return -1;
3610     }
3611     if(slice_type > 4){
3612         slice_type -= 5;
3613         h->slice_type_fixed=1;
3614     }else
3615         h->slice_type_fixed=0;
3616
3617     slice_type= golomb_to_pict_type[ slice_type ];
3618     if (slice_type == FF_I_TYPE
3619         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3620         default_ref_list_done = 1;
3621     }
3622     h->slice_type= slice_type;
3623     h->slice_type_nos= slice_type & 3;
3624
3625     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3626     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3627         av_log(h->s.avctx, AV_LOG_ERROR,
3628                "B picture before any references, skipping\n");
3629         return -1;
3630     }
3631
3632     pps_id= get_ue_golomb(&s->gb);
3633     if(pps_id>=MAX_PPS_COUNT){
3634         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3635         return -1;
3636     }
3637     if(!h0->pps_buffers[pps_id]) {
3638         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3639         return -1;
3640     }
3641     h->pps= *h0->pps_buffers[pps_id];
3642
3643     if(!h0->sps_buffers[h->pps.sps_id]) {
3644         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3645         return -1;
3646     }
3647     h->sps = *h0->sps_buffers[h->pps.sps_id];
3648
3649     if(h == h0 && h->dequant_coeff_pps != pps_id){
3650         h->dequant_coeff_pps = pps_id;
3651         init_dequant_tables(h);
3652     }
3653
3654     s->mb_width= h->sps.mb_width;
3655     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3656
3657     h->b_stride=  s->mb_width*4;
3658     h->b8_stride= s->mb_width*2;
3659
3660     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3661     if(h->sps.frame_mbs_only_flag)
3662         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3663     else
3664         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3665
3666     if (s->context_initialized
3667         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3668         if(h != h0)
3669             return -1;   // width / height changed during parallelized decoding
3670         free_tables(h);
3671         flush_dpb(s->avctx);
3672         MPV_common_end(s);
3673     }
3674     if (!s->context_initialized) {
3675         if(h != h0)
3676             return -1;  // we cant (re-)initialize context during parallel decoding
3677         if (MPV_common_init(s) < 0)
3678             return -1;
3679         s->first_field = 0;
3680
3681         init_scan_tables(h);
3682         alloc_tables(h);
3683
3684         for(i = 1; i < s->avctx->thread_count; i++) {
3685             H264Context *c;
3686             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3687             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3688             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3689             c->sps = h->sps;
3690             c->pps = h->pps;
3691             init_scan_tables(c);
3692             clone_tables(c, h);
3693         }
3694
3695         for(i = 0; i < s->avctx->thread_count; i++)
3696             if(context_init(h->thread_context[i]) < 0)
3697                 return -1;
3698
3699         s->avctx->width = s->width;
3700         s->avctx->height = s->height;
3701         s->avctx->sample_aspect_ratio= h->sps.sar;
3702         if(!s->avctx->sample_aspect_ratio.den)
3703             s->avctx->sample_aspect_ratio.den = 1;
3704
3705         if(h->sps.timing_info_present_flag){
3706             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3707             if(h->x264_build > 0 && h->x264_build < 44)
3708                 s->avctx->time_base.den *= 2;
3709             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3710                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3711         }
3712     }
3713
3714     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3715
3716     h->mb_mbaff = 0;
3717     h->mb_aff_frame = 0;
3718     last_pic_structure = s0->picture_structure;
3719     if(h->sps.frame_mbs_only_flag){
3720         s->picture_structure= PICT_FRAME;
3721     }else{
3722         if(get_bits1(&s->gb)) { //field_pic_flag
3723             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3724         } else {
3725             s->picture_structure= PICT_FRAME;
3726             h->mb_aff_frame = h->sps.mb_aff;
3727         }
3728     }
3729     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3730
3731     if(h0->current_slice == 0){
3732         while(h->frame_num !=  h->prev_frame_num &&
3733               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3734             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3735             frame_start(h);
3736             h->prev_frame_num++;
3737             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3738             s->current_picture_ptr->frame_num= h->prev_frame_num;
3739             execute_ref_pic_marking(h, NULL, 0);
3740         }
3741
3742         /* See if we have a decoded first field looking for a pair... */
3743         if (s0->first_field) {
3744             assert(s0->current_picture_ptr);
3745             assert(s0->current_picture_ptr->data[0]);
3746             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3747
3748             /* figure out if we have a complementary field pair */
3749             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3750                 /*
3751                  * Previous field is unmatched. Don't display it, but let it
3752                  * remain for reference if marked as such.
3753                  */
3754                 s0->current_picture_ptr = NULL;
3755                 s0->first_field = FIELD_PICTURE;
3756
3757             } else {
3758                 if (h->nal_ref_idc &&
3759                         s0->current_picture_ptr->reference &&
3760                         s0->current_picture_ptr->frame_num != h->frame_num) {
3761                     /*
3762                      * This and previous field were reference, but had
3763                      * different frame_nums. Consider this field first in
3764                      * pair. Throw away previous field except for reference
3765                      * purposes.
3766                      */
3767                     s0->first_field = 1;
3768                     s0->current_picture_ptr = NULL;
3769
3770                 } else {
3771                     /* Second field in complementary pair */
3772                     s0->first_field = 0;
3773                 }
3774             }
3775
3776         } else {
3777             /* Frame or first field in a potentially complementary pair */
3778             assert(!s0->current_picture_ptr);
3779             s0->first_field = FIELD_PICTURE;
3780         }
3781
3782         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3783             s0->first_field = 0;
3784             return -1;
3785         }
3786     }
3787     if(h != h0)
3788         clone_slice(h, h0);
3789
3790     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3791
3792     assert(s->mb_num == s->mb_width * s->mb_height);
3793     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3794        first_mb_in_slice                    >= s->mb_num){
3795         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3796         return -1;
3797     }
3798     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3799     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3800     if (s->picture_structure == PICT_BOTTOM_FIELD)
3801         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3802     assert(s->mb_y < s->mb_height);
3803
3804     if(s->picture_structure==PICT_FRAME){
3805         h->curr_pic_num=   h->frame_num;
3806         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3807     }else{
3808         h->curr_pic_num= 2*h->frame_num + 1;
3809         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3810     }
3811
3812     if(h->nal_unit_type == NAL_IDR_SLICE){
3813         get_ue_golomb(&s->gb); /* idr_pic_id */
3814     }
3815
3816     if(h->sps.poc_type==0){
3817         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3818
3819         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3820             h->delta_poc_bottom= get_se_golomb(&s->gb);
3821         }
3822     }
3823
3824     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3825         h->delta_poc[0]= get_se_golomb(&s->gb);
3826
3827         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3828             h->delta_poc[1]= get_se_golomb(&s->gb);
3829     }
3830
3831     init_poc(h);
3832
3833     if(h->pps.redundant_pic_cnt_present){
3834         h->redundant_pic_count= get_ue_golomb(&s->gb);
3835     }
3836
3837     //set defaults, might be overridden a few lines later
3838     h->ref_count[0]= h->pps.ref_count[0];
3839     h->ref_count[1]= h->pps.ref_count[1];
3840
3841     if(h->slice_type_nos != FF_I_TYPE){
3842         if(h->slice_type_nos == FF_B_TYPE){
3843             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3844         }
3845         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3846
3847         if(num_ref_idx_active_override_flag){
3848             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3849             if(h->slice_type_nos==FF_B_TYPE)
3850                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3851
3852             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3853                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3854                 h->ref_count[0]= h->ref_count[1]= 1;
3855                 return -1;
3856             }
3857         }
3858         if(h->slice_type_nos == FF_B_TYPE)
3859             h->list_count= 2;
3860         else
3861             h->list_count= 1;
3862     }else
3863         h->list_count= 0;
3864
3865     if(!default_ref_list_done){
3866         fill_default_ref_list(h);
3867     }
3868
3869     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3870         return -1;
3871
3872     if(h->slice_type_nos!=FF_I_TYPE){
3873         s->last_picture_ptr= &h->ref_list[0][0];
3874         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3875     }
3876     if(h->slice_type_nos==FF_B_TYPE){
3877         s->next_picture_ptr= &h->ref_list[1][0];
3878         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3879     }
3880
3881     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3882        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3883         pred_weight_table(h);
3884     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3885         implicit_weight_table(h);
3886     else
3887         h->use_weight = 0;
3888
3889     if(h->nal_ref_idc)
3890         decode_ref_pic_marking(h0, &s->gb);
3891
3892     if(FRAME_MBAFF)
3893         fill_mbaff_ref_list(h);
3894
3895     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3896         direct_dist_scale_factor(h);
3897     direct_ref_list_init(h);
3898
3899     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3900         tmp = get_ue_golomb(&s->gb);
3901         if(tmp > 2){
3902             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3903             return -1;
3904         }
3905         h->cabac_init_idc= tmp;
3906     }
3907
3908     h->last_qscale_diff = 0;
3909     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3910     if(tmp>51){
3911         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3912         return -1;
3913     }
3914     s->qscale= tmp;
3915     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3916     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3917     //FIXME qscale / qp ... stuff
3918     if(h->slice_type == FF_SP_TYPE){
3919         get_bits1(&s->gb); /* sp_for_switch_flag */
3920     }
3921     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3922         get_se_golomb(&s->gb); /* slice_qs_delta */
3923     }
3924
3925     h->deblocking_filter = 1;
3926     h->slice_alpha_c0_offset = 0;
3927     h->slice_beta_offset = 0;
3928     if( h->pps.deblocking_filter_parameters_present ) {
3929         tmp= get_ue_golomb(&s->gb);
3930         if(tmp > 2){
3931             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3932             return -1;
3933         }
3934         h->deblocking_filter= tmp;
3935         if(h->deblocking_filter < 2)
3936             h->deblocking_filter^= 1; // 1<->0
3937
3938         if( h->deblocking_filter ) {
3939             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3940             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3941         }
3942     }
3943
3944     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3945        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3946        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3947        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3948         h->deblocking_filter= 0;
3949
3950     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3951         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3952             /* Cheat slightly for speed:
3953                Do not bother to deblock across slices. */
3954             h->deblocking_filter = 2;
3955         } else {
3956             h0->max_contexts = 1;
3957             if(!h0->single_decode_warning) {
3958                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3959                 h0->single_decode_warning = 1;
3960             }
3961             if(h != h0)
3962                 return 1; // deblocking switched inside frame
3963         }
3964     }
3965
3966 #if 0 //FMO
3967     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3968         slice_group_change_cycle= get_bits(&s->gb, ?);
3969 #endif
3970
3971     h0->last_slice_type = slice_type;
3972     h->slice_num = ++h0->current_slice;
3973     if(h->slice_num >= MAX_SLICES){
3974         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3975     }
3976
3977     for(j=0; j<2; j++){
3978         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3979         ref2frm[0]=
3980         ref2frm[1]= -1;
3981         for(i=0; i<16; i++)
3982             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3983                           +(h->ref_list[j][i].reference&3);
3984         ref2frm[18+0]=
3985         ref2frm[18+1]= -1;
3986         for(i=16; i<48; i++)
3987             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3988                           +(h->ref_list[j][i].reference&3);
3989     }
3990
3991     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3992     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3993
3994     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3995         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
3996                h->slice_num,
3997                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3998                first_mb_in_slice,
3999                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4000                pps_id, h->frame_num,
4001                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4002                h->ref_count[0], h->ref_count[1],
4003                s->qscale,
4004                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4005                h->use_weight,
4006                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4007                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4008                );
4009     }
4010
4011     return 0;
4012 }
4013
4014 /**
4015  *
4016  */
4017 static inline int get_level_prefix(GetBitContext *gb){
4018     unsigned int buf;
4019     int log;
4020
4021     OPEN_READER(re, gb);
4022     UPDATE_CACHE(re, gb);
4023     buf=GET_CACHE(re, gb);
4024
4025     log= 32 - av_log2(buf);
4026 #ifdef TRACE
4027     print_bin(buf>>(32-log), log);
4028     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4029 #endif
4030
4031     LAST_SKIP_BITS(re, gb, log);
4032     CLOSE_READER(re, gb);
4033
4034     return log-1;
4035 }
4036
4037 static inline int get_dct8x8_allowed(H264Context *h){
4038     int i;
4039     for(i=0; i<4; i++){
4040         if(!IS_SUB_8X8(h->sub_mb_type[i])
4041            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4042             return 0;
4043     }
4044     return 1;
4045 }
4046
4047 /**
4048  * decodes a residual block.
4049  * @param n block index
4050  * @param scantable scantable
4051  * @param max_coeff number of coefficients in the block
4052  * @return <0 if an error occurred
4053  */
4054 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4055     MpegEncContext * const s = &h->s;
4056     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4057     int level[16];
4058     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4059
4060     //FIXME put trailing_onex into the context
4061
4062     if(n == CHROMA_DC_BLOCK_INDEX){
4063         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4064         total_coeff= coeff_token>>2;
4065     }else{
4066         if(n == LUMA_DC_BLOCK_INDEX){
4067             total_coeff= pred_non_zero_count(h, 0);
4068             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4069             total_coeff= coeff_token>>2;
4070         }else{
4071             total_coeff= pred_non_zero_count(h, n);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4075         }
4076     }
4077
4078     //FIXME set last_non_zero?
4079
4080     if(total_coeff==0)
4081         return 0;
4082     if(total_coeff > (unsigned)max_coeff) {
4083         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4084         return -1;
4085     }
4086
4087     trailing_ones= coeff_token&3;
4088     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4089     assert(total_coeff<=16);
4090
4091     i = show_bits(gb, 3);
4092     skip_bits(gb, trailing_ones);
4093     level[0] = 1-((i&4)>>1);
4094     level[1] = 1-((i&2)   );
4095     level[2] = 1-((i&1)<<1);
4096
4097     if(trailing_ones<total_coeff) {
4098         int level_code, mask;
4099         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4100         int prefix= get_level_prefix(gb);
4101
4102         //first coefficient has suffix_length equal to 0 or 1
4103         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4104             if(suffix_length)
4105                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4106             else
4107                 level_code= (prefix<<suffix_length); //part
4108         }else if(prefix==14){
4109             if(suffix_length)
4110                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4111             else
4112                 level_code= prefix + get_bits(gb, 4); //part
4113         }else{
4114             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4115             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4116             if(prefix>=16)
4117                 level_code += (1<<(prefix-3))-4096;
4118         }
4119
4120         if(trailing_ones < 3) level_code += 2;
4121
4122         suffix_length = 1;
4123         if(level_code > 5)
4124             suffix_length++;
4125         mask= -(level_code&1);
4126         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4127
4128         //remaining coefficients have suffix_length > 0
4129         for(i=trailing_ones+1;i<total_coeff;i++) {
4130             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4131             prefix = get_level_prefix(gb);
4132             if(prefix<15){
4133                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4134             }else{
4135                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4136                 if(prefix>=16)
4137                     level_code += (1<<(prefix-3))-4096;
4138             }
4139             mask= -(level_code&1);
4140             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4141             if(level_code > suffix_limit[suffix_length])
4142                 suffix_length++;
4143         }
4144     }
4145
4146     if(total_coeff == max_coeff)
4147         zeros_left=0;
4148     else{
4149         if(n == CHROMA_DC_BLOCK_INDEX)
4150             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4151         else
4152             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4153     }
4154
4155     coeff_num = zeros_left + total_coeff - 1;
4156     j = scantable[coeff_num];
4157     if(n > 24){
4158         block[j] = level[0];
4159         for(i=1;i<total_coeff;i++) {
4160             if(zeros_left <= 0)
4161                 run_before = 0;
4162             else if(zeros_left < 7){
4163                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4164             }else{
4165                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4166             }
4167             zeros_left -= run_before;
4168             coeff_num -= 1 + run_before;
4169             j= scantable[ coeff_num ];
4170
4171             block[j]= level[i];
4172         }
4173     }else{
4174         block[j] = (level[0] * qmul[j] + 32)>>6;
4175         for(i=1;i<total_coeff;i++) {
4176             if(zeros_left <= 0)
4177                 run_before = 0;
4178             else if(zeros_left < 7){
4179                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4180             }else{
4181                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4182             }
4183             zeros_left -= run_before;
4184             coeff_num -= 1 + run_before;
4185             j= scantable[ coeff_num ];
4186
4187             block[j]= (level[i] * qmul[j] + 32)>>6;
4188         }
4189     }
4190
4191     if(zeros_left<0){
4192         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4193         return -1;
4194     }
4195
4196     return 0;
4197 }
4198
4199 static void predict_field_decoding_flag(H264Context *h){
4200     MpegEncContext * const s = &h->s;
4201     const int mb_xy= h->mb_xy;
4202     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4203                 ? s->current_picture.mb_type[mb_xy-1]
4204                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4205                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4206                 : 0;
4207     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4208 }
4209
4210 /**
4211  * decodes a P_SKIP or B_SKIP macroblock
4212  */
4213 static void decode_mb_skip(H264Context *h){
4214     MpegEncContext * const s = &h->s;
4215     const int mb_xy= h->mb_xy;
4216     int mb_type=0;
4217
4218     memset(h->non_zero_count[mb_xy], 0, 16);
4219     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4220
4221     if(MB_FIELD)
4222         mb_type|= MB_TYPE_INTERLACED;
4223
4224     if( h->slice_type_nos == FF_B_TYPE )
4225     {
4226         // just for fill_caches. pred_direct_motion will set the real mb_type
4227         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4228
4229         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4230         pred_direct_motion(h, &mb_type);
4231         mb_type|= MB_TYPE_SKIP;
4232     }
4233     else
4234     {
4235         int mx, my;
4236         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4237
4238         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4239         pred_pskip_motion(h, &mx, &my);
4240         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4241         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4242     }
4243
4244     write_back_motion(h, mb_type);
4245     s->current_picture.mb_type[mb_xy]= mb_type;
4246     s->current_picture.qscale_table[mb_xy]= s->qscale;
4247     h->slice_table[ mb_xy ]= h->slice_num;
4248     h->prev_mb_skipped= 1;
4249 }
4250
4251 /**
4252  * Decodes one macroblock at (s->mb_x, s->mb_y) from the bitstream reader
      * s->gb, using the Exp-Golomb readers (get_ue_golomb / get_se_golomb /
      * get_te0_golomb) for the syntax elements and decode_residual() for the
      * coefficients.
      *
      * In order, the function: handles skip runs (non-I slices), reads and remaps
      * mb_type, handles the raw PCM case, reads the prediction data (intra modes,
      * or reference indices followed by motion vector differences), reads the
      * coded block pattern and the QP delta, decodes the residual blocks and
      * finally writes the per-macroblock results back into the picture tables.
      *
      * NOTE(review): in the MB_MBAFF case h->ref_count[] is doubled before the
      * prediction data is parsed and only halved again right before the final
      * "return 0"; the early "return -1" paths in between leave it doubled.
      * Confirm that callers do not rely on ref_count after an error.
      *
4253  * @returns 0 if OK, -1 if an error is noticed (every error path in this function returns -1)
4254  */
4255 static int decode_mb_cavlc(H264Context *h){
4256     MpegEncContext * const s = &h->s;
4257     int mb_xy;
4258     int partition_count;
4259     unsigned int mb_type, cbp;
4260     int dct8x8_allowed= h->pps.transform_8x8_mode;
4261
4262     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4263
4264     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handling?)
4265
4266     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4267     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4268                 down the code */
4269     if(h->slice_type_nos != FF_I_TYPE){ // non-I slices: handle runs of skipped macroblocks
4270         if(s->mb_skip_run==-1) // -1 means no run length is pending, so read a new one
4271             s->mb_skip_run= get_ue_golomb(&s->gb);
4272
4273         if (s->mb_skip_run--) { // run was non-zero: this macroblock is skipped
4274             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4275                 if(s->mb_skip_run==0) // last skipped MB of the run: the field flag is read from the bitstream
4276                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4277                 else
4278                     predict_field_decoding_flag(h);
4279             }
4280             decode_mb_skip(h);
4281             return 0;
4282         }
4283     }
4284     if(FRAME_MBAFF){
4285         if( (s->mb_y&1) == 0 ) // the field decoding flag is only read on even macroblock rows
4286             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4287     }
4288
4289     h->prev_mb_skipped= 0;
4290
4291     mb_type= get_ue_golomb(&s->gb); // raw code number; remapped through the per-slice-type tables below
4292     if(h->slice_type_nos == FF_B_TYPE){
4293         if(mb_type < 23){
4294             partition_count= b_mb_type_info[mb_type].partition_count;
4295             mb_type=         b_mb_type_info[mb_type].type;
4296         }else{
4297             mb_type -= 23; // codes >= 23 in a B slice select an intra type
4298             goto decode_intra_mb;
4299         }
4300     }else if(h->slice_type_nos == FF_P_TYPE){
4301         if(mb_type < 5){
4302             partition_count= p_mb_type_info[mb_type].partition_count;
4303             mb_type=         p_mb_type_info[mb_type].type;
4304         }else{
4305             mb_type -= 5; // codes >= 5 in a P slice select an intra type
4306             goto decode_intra_mb;
4307         }
4308     }else{
4309        assert(h->slice_type_nos == FF_I_TYPE);
4310         if(h->slice_type == FF_SI_TYPE && mb_type) // in SI slices non-zero codes are shifted down by one
4311             mb_type--;
4312 decode_intra_mb:
4313         if(mb_type > 25){ // only intra codes 0..25 are accepted
4314             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4315             return -1;
4316         }
4317         partition_count=0;
4318         cbp= i_mb_type_info[mb_type].cbp;
4319         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4320         mb_type= i_mb_type_info[mb_type].type;
4321     }
4322
4323     if(MB_FIELD)
4324         mb_type |= MB_TYPE_INTERLACED;
4325
4326     h->slice_table[ mb_xy ]= h->slice_num;
4327
4328     if(IS_INTRA_PCM(mb_type)){
4329         unsigned int x;
4330
4331         // We assume these blocks are very rare so we do not optimize it.
4332         align_get_bits(&s->gb);
4333
4334         // The pixels are stored in the same order as levels in h->mb array.
4335         for(x=0; x < (CHROMA ? 384 : 256); x++){ // 256 bytes, or 384 when CHROMA is set
4336             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4337         }
4338
4339         // In deblocking, the quantizer is 0
4340         s->current_picture.qscale_table[mb_xy]= 0;
4341         // All coeffs are present
4342         memset(h->non_zero_count[mb_xy], 16, 16);
4343
4344         s->current_picture.mb_type[mb_xy]= mb_type;
4345         return 0;
4346     }
4347
4348     if(MB_MBAFF){ // doubled here, halved again just before the final return 0
4349         h->ref_count[0] <<= 1;
4350         h->ref_count[1] <<= 1;
4351     }
4352
4353     fill_caches(h, mb_type, 0);
4354
4355     //mb_pred
4356     if(IS_INTRA(mb_type)){
4357         int pred_mode;
4358 //            init_top_left_availability(h);
4359         if(IS_INTRA4x4(mb_type)){
4360             int i;
4361             int di = 1;
4362             if(dct8x8_allowed && get_bits1(&s->gb)){ // one flag bit selects the 8x8 transform
4363                 mb_type |= MB_TYPE_8x8DCT;
4364                 di = 4; // then only one mode per group of four 4x4 blocks is read
4365             }
4366
4367 //                fill_intra4x4_pred_table(h);
4368             for(i=0; i<16; i+=di){
4369                 int mode= pred_intra_mode(h, i);
4370
4371                 if(!get_bits1(&s->gb)){ // flag 0: a 3-bit remainder follows instead of using the predicted mode
4372                     const int rem_mode= get_bits(&s->gb, 3);
4373                     mode = rem_mode + (rem_mode >= mode); // the remainder skips over the predicted mode
4374                 }
4375
4376                 if(di==4)
4377                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4378                 else
4379                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4380             }
4381             write_back_intra_pred_mode(h);
4382             if( check_intra4x4_pred_mode(h) < 0)
4383                 return -1;
4384         }else{
4385             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4386             if(h->intra16x16_pred_mode < 0)
4387                 return -1;
4388         }
4389         if(CHROMA){
4390             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4391             if(pred_mode < 0)
4392                 return -1;
4393             h->chroma_pred_mode= pred_mode;
4394         }
4395     }else if(partition_count==4){ // four partitions, each with its own sub_mb_type
4396         int i, j, sub_partition_count[4], list, ref[2][4];
4397
4398         if(h->slice_type_nos == FF_B_TYPE){
4399             for(i=0; i<4; i++){
4400                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4401                 if(h->sub_mb_type[i] >=13){
4402                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4403                     return -1;
4404                 }
4405                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4406                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4407             }
4408             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4409                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4410                 pred_direct_motion(h, &mb_type);
4411                 h->ref_cache[0][scan8[4]] =
4412                 h->ref_cache[1][scan8[4]] =
4413                 h->ref_cache[0][scan8[12]] =
4414                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4415             }
4416         }else{
4417             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4418             for(i=0; i<4; i++){
4419                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4420                 if(h->sub_mb_type[i] >=4){
4421                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4422                     return -1;
4423                 }
4424                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4425                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4426             }
4427         }
4428
4429         for(list=0; list<h->list_count; list++){ // first pass: all reference indices, list by list
4430             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4431             for(i=0; i<4; i++){
4432                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4433                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4434                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4435                     if(tmp>=ref_count){
4436                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4437                         return -1;
4438                     }
4439                     ref[list][i]= tmp;
4440                 }else{
4441                  //FIXME
4442                     ref[list][i] = -1;
4443                 }
4444             }
4445         }
4446
4447         if(dct8x8_allowed)
4448             dct8x8_allowed = get_dct8x8_allowed(h);
4449
4450         for(list=0; list<h->list_count; list++){ // second pass: motion vectors, list by list
4451             for(i=0; i<4; i++){
4452                 if(IS_DIRECT(h->sub_mb_type[i])) {
4453                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4454                     continue;
4455                 }
4456                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4457                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4458
4459                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4460                     const int sub_mb_type= h->sub_mb_type[i];
4461                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4462                     for(j=0; j<sub_partition_count[i]; j++){
4463                         int mx, my;
4464                         const int index= 4*i + block_width*j;
4465                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4466                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4467                         mx += get_se_golomb(&s->gb); // predicted vector plus the signed difference from the bitstream
4468                         my += get_se_golomb(&s->gb);
4469                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4470
4471                         if(IS_SUB_8X8(sub_mb_type)){ // replicate the vector over the cache entries this sub-partition covers
4472                             mv_cache[ 1 ][0]=
4473                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4474                             mv_cache[ 1 ][1]=
4475                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4476                         }else if(IS_SUB_8X4(sub_mb_type)){
4477                             mv_cache[ 1 ][0]= mx;
4478                             mv_cache[ 1 ][1]= my;
4479                         }else if(IS_SUB_4X8(sub_mb_type)){
4480                             mv_cache[ 8 ][0]= mx;
4481                             mv_cache[ 8 ][1]= my;
4482                         }
4483                         mv_cache[ 0 ][0]= mx;
4484                         mv_cache[ 0 ][1]= my;
4485                     }
4486                 }else{
4487                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0]; // list unused for this partition: zero its four cached vectors
4488                     p[0] = p[1]=
4489                     p[8] = p[9]= 0;
4490                 }
4491             }
4492         }
4493     }else if(IS_DIRECT(mb_type)){
4494         pred_direct_motion(h, &mb_type);
4495         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4496     }else{
4497         int list, mx, my, i;
4498          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4499         if(IS_16X16(mb_type)){ // for each partition shape: first all reference indices, then all motion vectors
4500             for(list=0; list<h->list_count; list++){
4501                     unsigned int val;
4502                     if(IS_DIR(mb_type, 0, list)){
4503                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4504                         if(val >= h->ref_count[list]){
4505                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4506                             return -1;
4507                         }
4508                     }else
4509                         val= LIST_NOT_USED&0xFF;
4510                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4511             }
4512             for(list=0; list<h->list_count; list++){
4513                 unsigned int val;
4514                 if(IS_DIR(mb_type, 0, list)){
4515                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4516                     mx += get_se_golomb(&s->gb);
4517                     my += get_se_golomb(&s->gb);
4518                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4519
4520                     val= pack16to32(mx,my);
4521                 }else
4522                     val=0;
4523                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4524             }
4525         }
4526         else if(IS_16X8(mb_type)){
4527             for(list=0; list<h->list_count; list++){
4528                     for(i=0; i<2; i++){
4529                         unsigned int val;
4530                         if(IS_DIR(mb_type, i, list)){
4531                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4532                             if(val >= h->ref_count[list]){
4533                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4534                                 return -1;
4535                             }
4536                         }else
4537                             val= LIST_NOT_USED&0xFF;
4538                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4539                     }
4540             }
4541             for(list=0; list<h->list_count; list++){
4542                 for(i=0; i<2; i++){
4543                     unsigned int val;
4544                     if(IS_DIR(mb_type, i, list)){
4545                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4546                         mx += get_se_golomb(&s->gb);
4547                         my += get_se_golomb(&s->gb);
4548                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4549
4550                         val= pack16to32(mx,my);
4551                     }else
4552                         val=0;
4553                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4554                 }
4555             }
4556         }else{
4557             assert(IS_8X16(mb_type));
4558             for(list=0; list<h->list_count; list++){
4559                     for(i=0; i<2; i++){
4560                         unsigned int val;
4561                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4562                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4563                             if(val >= h->ref_count[list]){
4564                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4565                                 return -1;
4566                             }
4567                         }else
4568                             val= LIST_NOT_USED&0xFF;
4569                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4570                     }
4571             }
4572             for(list=0; list<h->list_count; list++){
4573                 for(i=0; i<2; i++){
4574                     unsigned int val;
4575                     if(IS_DIR(mb_type, i, list)){
4576                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4577                         mx += get_se_golomb(&s->gb);
4578                         my += get_se_golomb(&s->gb);
4579                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4580
4581                         val= pack16to32(mx,my);
4582                     }else
4583                         val=0;
4584                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4585                 }
4586             }
4587         }
4588     }
4589
4590     if(IS_INTER(mb_type))
4591         write_back_motion(h, mb_type);
4592
4593     if(!IS_INTRA16x16(mb_type)){ // for intra 16x16 the cbp was already taken from i_mb_type_info above
4594         cbp= get_ue_golomb(&s->gb);
4595         if(cbp > 47){
4596             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4597             return -1;
4598         }
4599
4600         if(CHROMA){ // map the code number to the actual pattern; the table depends on intra/inter and CHROMA
4601             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4602             else                     cbp= golomb_to_inter_cbp   [cbp];
4603         }else{
4604             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4605             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4606         }
4607     }
4608     h->cbp = cbp;
4609
4610     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){ // non-intra MBs with coded luma read the 8x8 transform flag here
4611         if(get_bits1(&s->gb)){
4612             mb_type |= MB_TYPE_8x8DCT;
4613             h->cbp_table[mb_xy]= cbp;
4614         }
4615     }
4616     s->current_picture.mb_type[mb_xy]= mb_type;
4617
4618     if(cbp || IS_INTRA16x16(mb_type)){
4619         int i8x8, i4x4, chroma_idx;
4620         int dquant;
4621         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4622         const uint8_t *scan, *scan8x8, *dc_scan;
4623
4624 //        fill_non_zero_count_cache(h);
4625
4626         if(IS_INTERLACED(mb_type)){ // scan tables: field vs. zigzag, with *_q0 variants when s->qscale is 0
4627             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4628             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4629             dc_scan= luma_dc_field_scan;
4630         }else{
4631             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4632             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4633             dc_scan= luma_dc_zigzag_scan;
4634         }
4635
4636         dquant= get_se_golomb(&s->gb);
4637
4638         if( dquant > 25 || dquant < -26 ){
4639             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4640             return -1;
4641         }
4642
4643         s->qscale += dquant;
4644         if(((unsigned)s->qscale) > 51){ // wrap the result back into 0..51
4645             if(s->qscale<0) s->qscale+= 52;
4646             else            s->qscale-= 52;
4647         }
4648
4649         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4650         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4651         if(IS_INTRA16x16(mb_type)){ // intra 16x16: one luma DC block, then up to 15 coefficients per 4x4 block
4652             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4653                 return -1; //FIXME continue if partitioned and other return -1 too
4654             }
4655
4656             assert((cbp&15) == 0 || (cbp&15) == 15);
4657
4658             if(cbp&15){
4659                 for(i8x8=0; i8x8<4; i8x8++){
4660                     for(i4x4=0; i4x4<4; i4x4++){
4661                         const int index= i4x4 + 4*i8x8;
4662                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4663                             return -1;
4664                         }
4665                     }
4666                 }
4667             }else{
4668                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4669             }
4670         }else{
4671             for(i8x8=0; i8x8<4; i8x8++){
4672                 if(cbp & (1<<i8x8)){ // bit i8x8 of cbp: this 8x8 luma block has coded coefficients
4673                     if(IS_8x8DCT(mb_type)){
4674                         DCTELEM *buf = &h->mb[64*i8x8];
4675                         uint8_t *nnz;
4676                         for(i4x4=0; i4x4<4; i4x4++){
4677                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4678                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4679                                 return -1;
4680                         }
4681                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4682                         nnz[0] += nnz[1] + nnz[8] + nnz[9]; // fold the four per-4x4 counts into the first entry
4683                     }else{
4684                         for(i4x4=0; i4x4<4; i4x4++){
4685                             const int index= i4x4 + 4*i8x8;
4686
4687                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4688                                 return -1;
4689                             }
4690                         }
4691                     }
4692                 }else{
4693                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4694                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4695                 }
4696             }
4697         }
4698
4699         if(cbp&0x30){ // either chroma bit set: decode the two chroma DC blocks
4700             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4701                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4702                     return -1;
4703                 }
4704         }
4705
4706         if(cbp&0x20){ // bit 5 set: additionally decode the chroma AC blocks (4 per plane, 15 coefficients each)
4707             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4708                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4709                 for(i4x4=0; i4x4<4; i4x4++){
4710                     const int index= 16 + 4*chroma_idx + i4x4;
4711                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4712                         return -1;
4713                     }
4714                 }
4715             }
4716         }else{
4717             uint8_t * const nnz= &h->non_zero_count_cache[0];
4718             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4719             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4720         }
4721     }else{
4722         uint8_t * const nnz= &h->non_zero_count_cache[0]; // nothing coded: clear all luma and chroma non-zero counts
4723         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4724         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4725         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4726     }
4727     s->current_picture.qscale_table[mb_xy]= s->qscale;
4728     write_back_non_zero_count(h);
4729
4730     if(MB_MBAFF){ // undo the doubling applied before fill_caches()
4731         h->ref_count[0] >>= 1;
4732         h->ref_count[1] >>= 1;
4733     }
4734
4735     return 0;
4736 }
4737
4738 static int decode_cabac_field_decoding_flag(H264Context *h) {
4739     MpegEncContext * const s = &h->s;
4740     const int mb_x = s->mb_x;
4741     const int mb_y = s->mb_y & ~1;
4742     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4743     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4744
4745     unsigned int ctx = 0;
4746
4747     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4748         ctx += 1;
4749     }
4750     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4751         ctx += 1;
4752     }
4753
4754     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4755 }
4756
4757 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4758     uint8_t *state= &h->cabac_state[ctx_base];
4759     int mb_type;
4760
4761     if(intra_slice){
4762         MpegEncContext * const s = &h->s;
4763         const int mba_xy = h->left_mb_xy[0];
4764         const int mbb_xy = h->top_mb_xy;
4765         int ctx=0;
4766         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4767             ctx++;
4768         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4769             ctx++;
4770         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4771             return 0;   /* I4x4 */
4772         state += 2;
4773     }else{
4774         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4775             return 0;   /* I4x4 */
4776     }
4777
4778     if( get_cabac_terminate( &h->cabac ) )
4779         return 25;  /* PCM */
4780
4781     mb_type = 1; /* I16x16 */
4782     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4783     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4784         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4785     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4786     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4787     return mb_type;
4788 }
4789
4790 static int decode_cabac_mb_type( H264Context *h ) {
4791     MpegEncContext * const s = &h->s;
4792
4793     if( h->slice_type_nos == FF_I_TYPE ) {
4794         return decode_cabac_intra_mb_type(h, 3, 1);
4795     } else if( h->slice_type_nos == FF_P_TYPE ) {
4796         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4797             /* P-type */
4798             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4799                 /* P_L0_D16x16, P_8x8 */
4800                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4801             } else {
4802                 /* P_L0_D8x16, P_L0_D16x8 */
4803                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4804             }
4805         } else {
4806             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4807         }
4808     } else {
4809         const int mba_xy = h->left_mb_xy[0];
4810         const int mbb_xy = h->top_mb_xy;
4811         int ctx = 0;
4812         int bits;
4813         assert(h->slice_type_nos == FF_B_TYPE);
4814
4815         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4816             ctx++;
4817         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4818             ctx++;
4819
4820         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4821             return 0; /* B_Direct_16x16 */
4822
4823         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4824             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4825         }
4826
4827         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4828         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4829         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4830         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4831         if( bits < 8 )
4832             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4833         else if( bits == 13 ) {
4834             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4835         } else if( bits == 14 )
4836             return 11; /* B_L1_L0_8x16 */
4837         else if( bits == 15 )
4838             return 22; /* B_8x8 */
4839
4840         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4841         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4842     }
4843 }
4844
4845 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4846     MpegEncContext * const s = &h->s;
4847     int mba_xy, mbb_xy;
4848     int ctx = 0;
4849
4850     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4851         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4852         mba_xy = mb_xy - 1;
4853         if( (mb_y&1)
4854             && h->slice_table[mba_xy] == h->slice_num
4855             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4856             mba_xy += s->mb_stride;
4857         if( MB_FIELD ){
4858             mbb_xy = mb_xy - s->mb_stride;
4859             if( !(mb_y&1)
4860                 && h->slice_table[mbb_xy] == h->slice_num
4861                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4862                 mbb_xy -= s->mb_stride;
4863         }else
4864             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4865     }else{
4866         int mb_xy = h->mb_xy;
4867         mba_xy = mb_xy - 1;
4868         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4869     }
4870
4871     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4872         ctx++;
4873     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4874         ctx++;
4875
4876     if( h->slice_type_nos == FF_B_TYPE )
4877         ctx += 13;
4878     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4879 }
4880
4881 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4882     int mode = 0;
4883
4884     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4885         return pred_mode;
4886
4887     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4888     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4889     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4890
4891     if( mode >= pred_mode )
4892         return mode + 1;
4893     else
4894         return mode;
4895 }
4896
4897 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4898     const int mba_xy = h->left_mb_xy[0];
4899     const int mbb_xy = h->top_mb_xy;
4900
4901     int ctx = 0;
4902
4903     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4904     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4905         ctx++;
4906
4907     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4908         ctx++;
4909
4910     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4911         return 0;
4912
4913     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4914         return 1;
4915     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4916         return 2;
4917     else
4918         return 3;
4919 }
4920
4921 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4922     int cbp_b, cbp_a, ctx, cbp = 0;
4923
4924     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4925     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4926
4927     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4928     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4929     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4930     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4931     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4932     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4933     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4934     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4935     return cbp;
4936 }
4937 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4938     int ctx;
4939     int cbp_a, cbp_b;
4940
4941     cbp_a = (h->left_cbp>>4)&0x03;
4942     cbp_b = (h-> top_cbp>>4)&0x03;
4943
4944     ctx = 0;
4945     if( cbp_a > 0 ) ctx++;
4946     if( cbp_b > 0 ) ctx += 2;
4947     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4948         return 0;
4949
4950     ctx = 4;
4951     if( cbp_a == 2 ) ctx++;
4952     if( cbp_b == 2 ) ctx += 2;
4953     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4954 }
4955 static int decode_cabac_mb_dqp( H264Context *h) {
4956     int   ctx = 0;
4957     int   val = 0;
4958
4959     if( h->last_qscale_diff != 0 )
4960         ctx++;
4961
4962     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4963         if( ctx < 2 )
4964             ctx = 2;
4965         else
4966             ctx = 3;
4967         val++;
4968         if(val > 102) //prevent infinite loop
4969             return INT_MIN;
4970     }
4971
4972     if( val&0x01 )
4973         return (val + 1)/2;
4974     else
4975         return -(val + 1)/2;
4976 }
4977 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4978     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4979         return 0;   /* 8x8 */
4980     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4981         return 1;   /* 8x4 */
4982     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4983         return 2;   /* 4x8 */
4984     return 3;       /* 4x4 */
4985 }
4986 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4987     int type;
4988     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4989         return 0;   /* B_Direct_8x8 */
4990     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4991         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4992     type = 3;
4993     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4994         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4995             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4996         type += 4;
4997     }
4998     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4999     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5000     return type;
5001 }
5002
5003 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5004     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5005 }
5006
5007 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5008     int refa = h->ref_cache[list][scan8[n] - 1];
5009     int refb = h->ref_cache[list][scan8[n] - 8];
5010     int ref  = 0;
5011     int ctx  = 0;
5012
5013     if( h->slice_type_nos == FF_B_TYPE) {
5014         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5015             ctx++;
5016         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5017             ctx += 2;
5018     } else {
5019         if( refa > 0 )
5020             ctx++;
5021         if( refb > 0 )
5022             ctx += 2;
5023     }
5024
5025     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5026         ref++;
5027         if( ctx < 4 )
5028             ctx = 4;
5029         else
5030             ctx = 5;
5031         if(ref >= 32 /*h->ref_list[list]*/){
5032             return -1;
5033         }
5034     }
5035     return ref;
5036 }
5037
5038 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5039     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5040                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5041     int ctxbase = (l == 0) ? 40 : 47;
5042     int mvd;
5043     int ctx = (amvd>2) + (amvd>32);
5044
5045     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5046         return 0;
5047
5048     mvd= 1;
5049     ctx= 3;
5050     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5051         mvd++;
5052         if( ctx < 6 )
5053             ctx++;
5054     }
5055
5056     if( mvd >= 9 ) {
5057         int k = 3;
5058         while( get_cabac_bypass( &h->cabac ) ) {
5059             mvd += 1 << k;
5060             k++;
5061             if(k>24){
5062                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5063                 return INT_MIN;
5064             }
5065         }
5066         while( k-- ) {
5067             if( get_cabac_bypass( &h->cabac ) )
5068                 mvd += 1 << k;
5069         }
5070     }
5071     return get_cabac_bypass_sign( &h->cabac, -mvd );
5072 }
5073
5074 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5075     int nza, nzb;
5076     int ctx = 0;
5077
5078     if( is_dc ) {
5079         if( cat == 0 ) {
5080             nza = h->left_cbp&0x100;
5081             nzb = h-> top_cbp&0x100;
5082         } else {
5083             nza = (h->left_cbp>>(6+idx))&0x01;
5084             nzb = (h-> top_cbp>>(6+idx))&0x01;
5085         }
5086     } else {
5087         if( cat == 4 ) {
5088             nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5089             nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5090         } else {
5091             assert(cat == 1 || cat == 2);
5092             nza = h->non_zero_count_cache[scan8[idx] - 1];
5093             nzb = h->non_zero_count_cache[scan8[idx] - 8];
5094         }
5095     }
5096
5097     if( nza > 0 )
5098         ctx++;
5099
5100     if( nzb > 0 )
5101         ctx += 2;
5102
5103     return ctx + 4 * cat;
5104 }
5105
5106 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5107     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5108     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5109     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5110     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5111 };
5112
5113 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5114     static const int significant_coeff_flag_offset[2][6] = {
5115       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5116       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5117     };
5118     static const int last_coeff_flag_offset[2][6] = {
5119       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5120       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5121     };
5122     static const int coeff_abs_level_m1_offset[6] = {
5123         227+0, 227+10, 227+20, 227+30, 227+39, 426
5124     };
5125     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5126       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5127         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5128         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5129        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5130       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5131         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5132         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5133         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5134     };
5135     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5136      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5137      * map node ctx => cabac ctx for level=1 */
5138     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5139     /* map node ctx => cabac ctx for level>1 */
5140     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5141     static const uint8_t coeff_abs_level_transition[2][8] = {
5142     /* update node ctx after decoding a level=1 */
5143         { 1, 2, 3, 3, 4, 5, 6, 7 },
5144     /* update node ctx after decoding a level>1 */
5145         { 4, 4, 4, 4, 5, 6, 7, 7 }
5146     };
5147
5148     int index[64];
5149
5150     int av_unused last;
5151     int coeff_count = 0;
5152     int node_ctx = 0;
5153
5154     uint8_t *significant_coeff_ctx_base;
5155     uint8_t *last_coeff_ctx_base;
5156     uint8_t *abs_level_m1_ctx_base;
5157
5158 #ifndef ARCH_X86
5159 #define CABAC_ON_STACK
5160 #endif
5161 #ifdef CABAC_ON_STACK
5162 #define CC &cc
5163     CABACContext cc;
5164     cc.range     = h->cabac.range;
5165     cc.low       = h->cabac.low;
5166     cc.bytestream= h->cabac.bytestream;
5167 #else
5168 #define CC &h->cabac
5169 #endif
5170
5171
5172     /* cat: 0-> DC 16x16  n = 0
5173      *      1-> AC 16x16  n = luma4x4idx
5174      *      2-> Luma4x4   n = luma4x4idx
5175      *      3-> DC Chroma n = iCbCr
5176      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5177      *      5-> Luma8x8   n = 4 * luma8x8idx
5178      */
5179
5180     /* read coded block flag */
5181     if( is_dc || cat != 5 ) {
5182         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5183             if( !is_dc ) {
5184                 if( cat == 4 )
5185                     h->non_zero_count_cache[scan8[16+n]] = 0;
5186                 else
5187                     h->non_zero_count_cache[scan8[n]] = 0;
5188             }
5189
5190 #ifdef CABAC_ON_STACK
5191             h->cabac.range     = cc.range     ;
5192             h->cabac.low       = cc.low       ;
5193             h->cabac.bytestream= cc.bytestream;
5194 #endif
5195             return;
5196         }
5197     }
5198
5199     significant_coeff_ctx_base = h->cabac_state
5200         + significant_coeff_flag_offset[MB_FIELD][cat];
5201     last_coeff_ctx_base = h->cabac_state
5202         + last_coeff_flag_offset[MB_FIELD][cat];
5203     abs_level_m1_ctx_base = h->cabac_state
5204         + coeff_abs_level_m1_offset[cat];
5205
5206     if( !is_dc && cat == 5 ) {
5207 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5208         for(last= 0; last < coefs; last++) { \
5209             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5210             if( get_cabac( CC, sig_ctx )) { \
5211                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5212                 index[coeff_count++] = last; \
5213                 if( get_cabac( CC, last_ctx ) ) { \
5214                     last= max_coeff; \
5215                     break; \
5216                 } \
5217             } \
5218         }\
5219         if( last == max_coeff -1 ) {\
5220             index[coeff_count++] = last;\
5221         }
5222         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5223 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5224         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5225     } else {
5226         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5227 #else
5228         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5229     } else {
5230         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5231 #endif
5232     }
5233     assert(coeff_count > 0);
5234
5235     if( is_dc ) {
5236         if( cat == 0 )
5237             h->cbp_table[h->mb_xy] |= 0x100;
5238         else
5239             h->cbp_table[h->mb_xy] |= 0x40 << n;
5240     } else {
5241         if( cat == 5 )
5242             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5243         else if( cat == 4 )
5244             h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5245         else {
5246             assert( cat == 1 || cat == 2 );
5247             h->non_zero_count_cache[scan8[n]] = coeff_count;
5248         }
5249     }
5250
5251     do {
5252         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5253
5254         int j= scantable[index[--coeff_count]];
5255
5256         if( get_cabac( CC, ctx ) == 0 ) {
5257             node_ctx = coeff_abs_level_transition[0][node_ctx];
5258             if( is_dc ) {
5259                 block[j] = get_cabac_bypass_sign( CC, -1);
5260             }else{
5261                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5262             }
5263         } else {
5264             int coeff_abs = 2;
5265             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5266             node_ctx = coeff_abs_level_transition[1][node_ctx];
5267
5268             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5269                 coeff_abs++;
5270             }
5271
5272             if( coeff_abs >= 15 ) {
5273                 int j = 0;
5274                 while( get_cabac_bypass( CC ) ) {
5275                     j++;
5276                 }
5277
5278                 coeff_abs=1;
5279                 while( j-- ) {
5280                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5281                 }
5282                 coeff_abs+= 14;
5283             }
5284
5285             if( is_dc ) {
5286                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5287             }else{
5288                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5289             }
5290         }
5291     } while( coeff_count );
5292 #ifdef CABAC_ON_STACK
5293             h->cabac.range     = cc.range     ;
5294             h->cabac.low       = cc.low       ;
5295             h->cabac.bytestream= cc.bytestream;
5296 #endif
5297
5298 }
5299
5300 #ifndef CONFIG_SMALL
5301 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5302     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5303 }
5304
5305 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5306     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5307 }
5308 #endif
5309
5310 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5311 #ifdef CONFIG_SMALL
5312     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5313 #else
5314     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5315     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5316 #endif
5317 }
5318
5319 static inline void compute_mb_neighbors(H264Context *h)
5320 {
5321     MpegEncContext * const s = &h->s;
5322     const int mb_xy  = h->mb_xy;
5323     h->top_mb_xy     = mb_xy - s->mb_stride;
5324     h->left_mb_xy[0] = mb_xy - 1;
5325     if(FRAME_MBAFF){
5326         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5327         const int top_pair_xy      = pair_xy     - s->mb_stride;
5328         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5329         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5330         const int curr_mb_frame_flag = !MB_FIELD;
5331         const int bottom = (s->mb_y & 1);
5332         if (bottom
5333                 ? !curr_mb_frame_flag // bottom macroblock
5334                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5335                 ) {
5336             h->top_mb_xy -= s->mb_stride;
5337         }
5338         if (left_mb_frame_flag != curr_mb_frame_flag) {
5339             h->left_mb_xy[0] = pair_xy - 1;
5340         }
5341     } else if (FIELD_PICTURE) {
5342         h->top_mb_xy -= s->mb_stride;
5343     }
5344     return;
5345 }
5346
5347 /**
5348  * decodes a macroblock
5349  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5350  */
5351 static int decode_mb_cabac(H264Context *h) {
5352     MpegEncContext * const s = &h->s;
5353     int mb_xy;
5354     int mb_type, partition_count, cbp = 0;
5355     int dct8x8_allowed= h->pps.transform_8x8_mode;
5356
5357     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5358
5359     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5360
5361     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5362     if( h->slice_type_nos != FF_I_TYPE ) {
5363         int skip;
5364         /* a skipped mb needs the aff flag from the following mb */
5365         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5366             predict_field_decoding_flag(h);
5367         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5368             skip = h->next_mb_skipped;
5369         else
5370             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5371         /* read skip flags */
5372         if( skip ) {
5373             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5374                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5375                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5376                 if(h->next_mb_skipped)
5377                     predict_field_decoding_flag(h);
5378                 else
5379                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5380             }
5381
5382             decode_mb_skip(h);
5383
5384             h->cbp_table[mb_xy] = 0;
5385             h->chroma_pred_mode_table[mb_xy] = 0;
5386             h->last_qscale_diff = 0;
5387
5388             return 0;
5389
5390         }
5391     }
5392     if(FRAME_MBAFF){
5393         if( (s->mb_y&1) == 0 )
5394             h->mb_mbaff =
5395             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5396     }
5397
5398     h->prev_mb_skipped = 0;
5399
5400     compute_mb_neighbors(h);
5401     mb_type = decode_cabac_mb_type( h );
5402     assert(mb_type >= 0);
5403
5404     if( h->slice_type_nos == FF_B_TYPE ) {
5405         if( mb_type < 23 ){
5406             partition_count= b_mb_type_info[mb_type].partition_count;
5407             mb_type=         b_mb_type_info[mb_type].type;
5408         }else{
5409             mb_type -= 23;
5410             goto decode_intra_mb;
5411         }
5412     } else if( h->slice_type_nos == FF_P_TYPE ) {
5413         if( mb_type < 5) {
5414             partition_count= p_mb_type_info[mb_type].partition_count;
5415             mb_type=         p_mb_type_info[mb_type].type;
5416         } else {
5417             mb_type -= 5;
5418             goto decode_intra_mb;
5419         }
5420     } else {
5421         if(h->slice_type == FF_SI_TYPE && mb_type)
5422             mb_type--;
5423         assert(h->slice_type_nos == FF_I_TYPE);
5424 decode_intra_mb:
5425         partition_count = 0;
5426         cbp= i_mb_type_info[mb_type].cbp;
5427         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5428         mb_type= i_mb_type_info[mb_type].type;
5429     }
5430     if(MB_FIELD)
5431         mb_type |= MB_TYPE_INTERLACED;
5432
5433     h->slice_table[ mb_xy ]= h->slice_num;
5434
5435     if(IS_INTRA_PCM(mb_type)) {
5436         const uint8_t *ptr;
5437
5438         // We assume these blocks are very rare so we do not optimize it.
5439         // FIXME The two following lines get the bitstream position in the cabac
5440         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5441         ptr= h->cabac.bytestream;
5442         if(h->cabac.low&0x1) ptr--;
5443         if(CABAC_BITS==16){
5444             if(h->cabac.low&0x1FF) ptr--;
5445         }
5446
5447         // The pixels are stored in the same order as levels in h->mb array.
5448         memcpy(h->mb, ptr, 256); ptr+=256;
5449         if(CHROMA){
5450             memcpy(h->mb+128, ptr, 128); ptr+=128;
5451         }
5452
5453         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5454
5455         // All blocks are present
5456         h->cbp_table[mb_xy] = 0x1ef;
5457         h->chroma_pred_mode_table[mb_xy] = 0;
5458         // In deblocking, the quantizer is 0
5459         s->current_picture.qscale_table[mb_xy]= 0;
5460         // All coeffs are present
5461         memset(h->non_zero_count[mb_xy], 16, 16);
5462         s->current_picture.mb_type[mb_xy]= mb_type;
5463         h->last_qscale_diff = 0;
5464         return 0;
5465     }
5466
5467     if(MB_MBAFF){
5468         h->ref_count[0] <<= 1;
5469         h->ref_count[1] <<= 1;
5470     }
5471
5472     fill_caches(h, mb_type, 0);
5473
5474     if( IS_INTRA( mb_type ) ) {
5475         int i, pred_mode;
5476         if( IS_INTRA4x4( mb_type ) ) {
5477             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5478                 mb_type |= MB_TYPE_8x8DCT;
5479                 for( i = 0; i < 16; i+=4 ) {
5480                     int pred = pred_intra_mode( h, i );
5481                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5482                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5483                 }
5484             } else {
5485                 for( i = 0; i < 16; i++ ) {
5486                     int pred = pred_intra_mode( h, i );
5487                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5488
5489                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5490                 }
5491             }
5492             write_back_intra_pred_mode(h);
5493             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5494         } else {
5495             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5496             if( h->intra16x16_pred_mode < 0 ) return -1;
5497         }
5498         if(CHROMA){
5499             h->chroma_pred_mode_table[mb_xy] =
5500             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5501
5502             pred_mode= check_intra_pred_mode( h, pred_mode );
5503             if( pred_mode < 0 ) return -1;
5504             h->chroma_pred_mode= pred_mode;
5505         }
5506     } else if( partition_count == 4 ) {
5507         int i, j, sub_partition_count[4], list, ref[2][4];
5508
5509         if( h->slice_type_nos == FF_B_TYPE ) {
5510             for( i = 0; i < 4; i++ ) {
5511                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5512                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5513                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5514             }
5515             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5516                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5517                 pred_direct_motion(h, &mb_type);
5518                 h->ref_cache[0][scan8[4]] =
5519                 h->ref_cache[1][scan8[4]] =
5520                 h->ref_cache[0][scan8[12]] =
5521                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5522                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5523                     for( i = 0; i < 4; i++ )
5524                         if( IS_DIRECT(h->sub_mb_type[i]) )
5525                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5526                 }
5527             }
5528         } else {
5529             for( i = 0; i < 4; i++ ) {
5530                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5531                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5532                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5533             }
5534         }
5535
5536         for( list = 0; list < h->list_count; list++ ) {
5537                 for( i = 0; i < 4; i++ ) {
5538                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5539                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5540                         if( h->ref_count[list] > 1 ){
5541                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5542                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5543                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5544                                 return -1;
5545                             }
5546                         }else
5547                             ref[list][i] = 0;
5548                     } else {
5549                         ref[list][i] = -1;
5550                     }
5551                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5552                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5553                 }
5554         }
5555
5556         if(dct8x8_allowed)
5557             dct8x8_allowed = get_dct8x8_allowed(h);
5558
5559         for(list=0; list<h->list_count; list++){
5560             for(i=0; i<4; i++){
5561                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5562                 if(IS_DIRECT(h->sub_mb_type[i])){
5563                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5564                     continue;
5565                 }
5566
5567                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5568                     const int sub_mb_type= h->sub_mb_type[i];
5569                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5570                     for(j=0; j<sub_partition_count[i]; j++){
5571                         int mpx, mpy;
5572                         int mx, my;
5573                         const int index= 4*i + block_width*j;
5574                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5575                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5576                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5577
5578                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5579                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5580                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5581
5582                         if(IS_SUB_8X8(sub_mb_type)){
5583                             mv_cache[ 1 ][0]=
5584                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5585                             mv_cache[ 1 ][1]=
5586                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5587
5588                             mvd_cache[ 1 ][0]=
5589                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5590                             mvd_cache[ 1 ][1]=
5591                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5592                         }else if(IS_SUB_8X4(sub_mb_type)){
5593                             mv_cache[ 1 ][0]= mx;
5594                             mv_cache[ 1 ][1]= my;
5595
5596                             mvd_cache[ 1 ][0]= mx - mpx;
5597                             mvd_cache[ 1 ][1]= my - mpy;
5598                         }else if(IS_SUB_4X8(sub_mb_type)){
5599                             mv_cache[ 8 ][0]= mx;
5600                             mv_cache[ 8 ][1]= my;
5601
5602                             mvd_cache[ 8 ][0]= mx - mpx;
5603                             mvd_cache[ 8 ][1]= my - mpy;
5604                         }
5605                         mv_cache[ 0 ][0]= mx;
5606                         mv_cache[ 0 ][1]= my;
5607
5608                         mvd_cache[ 0 ][0]= mx - mpx;
5609                         mvd_cache[ 0 ][1]= my - mpy;
5610                     }
5611                 }else{
5612                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5613                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5614                     p[0] = p[1] = p[8] = p[9] = 0;
5615                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5616                 }
5617             }
5618         }
5619     } else if( IS_DIRECT(mb_type) ) {
5620         pred_direct_motion(h, &mb_type);
5621         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5622         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5623         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5624     } else {
5625         int list, mx, my, i, mpx, mpy;
5626         if(IS_16X16(mb_type)){
5627             for(list=0; list<h->list_count; list++){
5628                 if(IS_DIR(mb_type, 0, list)){
5629                     int ref;
5630                     if(h->ref_count[list] > 1){
5631                         ref= decode_cabac_mb_ref(h, list, 0);
5632                         if(ref >= (unsigned)h->ref_count[list]){
5633                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5634                             return -1;
5635                         }
5636                     }else
5637                         ref=0;
5638                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5639                 }else
5640                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5641             }
5642             for(list=0; list<h->list_count; list++){
5643                 if(IS_DIR(mb_type, 0, list)){
5644                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5645
5646                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5647                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5648                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5649
5650                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5651                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5652                 }else
5653                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5654             }
5655         }
5656         else if(IS_16X8(mb_type)){
5657             for(list=0; list<h->list_count; list++){
5658                     for(i=0; i<2; i++){
5659                         if(IS_DIR(mb_type, i, list)){
5660                             int ref;
5661                             if(h->ref_count[list] > 1){
5662                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5663                                 if(ref >= (unsigned)h->ref_count[list]){
5664                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5665                                     return -1;
5666                                 }
5667                             }else
5668                                 ref=0;
5669                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5670                         }else
5671                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5672                     }
5673             }
5674             for(list=0; list<h->list_count; list++){
5675                 for(i=0; i<2; i++){
5676                     if(IS_DIR(mb_type, i, list)){
5677                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5678                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5679                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5680                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5681
5682                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5683                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5684                     }else{
5685                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5686                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5687                     }
5688                 }
5689             }
5690         }else{
5691             assert(IS_8X16(mb_type));
5692             for(list=0; list<h->list_count; list++){
5693                     for(i=0; i<2; i++){
5694                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5695                             int ref;
5696                             if(h->ref_count[list] > 1){
5697                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5698                                 if(ref >= (unsigned)h->ref_count[list]){
5699                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5700                                     return -1;
5701                                 }
5702                             }else
5703                                 ref=0;
5704                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5705                         }else
5706                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5707                     }
5708             }
5709             for(list=0; list<h->list_count; list++){
5710                 for(i=0; i<2; i++){
5711                     if(IS_DIR(mb_type, i, list)){
5712                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5713                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5714                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5715
5716                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5717                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5718                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5719                     }else{
5720                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5721                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5722                     }
5723                 }
5724             }
5725         }
5726     }
5727
5728    if( IS_INTER( mb_type ) ) {
5729         h->chroma_pred_mode_table[mb_xy] = 0;
5730         write_back_motion( h, mb_type );
5731    }
5732
5733     if( !IS_INTRA16x16( mb_type ) ) {
5734         cbp  = decode_cabac_mb_cbp_luma( h );
5735         if(CHROMA)
5736             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5737     }
5738
5739     h->cbp_table[mb_xy] = h->cbp = cbp;
5740
5741     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5742         if( decode_cabac_mb_transform_size( h ) )
5743             mb_type |= MB_TYPE_8x8DCT;
5744     }
5745     s->current_picture.mb_type[mb_xy]= mb_type;
5746
5747     if( cbp || IS_INTRA16x16( mb_type ) ) {
5748         const uint8_t *scan, *scan8x8, *dc_scan;
5749         const uint32_t *qmul;
5750         int dqp;
5751
5752         if(IS_INTERLACED(mb_type)){
5753             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5754             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5755             dc_scan= luma_dc_field_scan;
5756         }else{
5757             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5758             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5759             dc_scan= luma_dc_zigzag_scan;
5760         }
5761
5762         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5763         if( dqp == INT_MIN ){
5764             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5765             return -1;
5766         }
5767         s->qscale += dqp;
5768         if(((unsigned)s->qscale) > 51){
5769             if(s->qscale<0) s->qscale+= 52;
5770             else            s->qscale-= 52;
5771         }
5772         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5773         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5774
5775         if( IS_INTRA16x16( mb_type ) ) {
5776             int i;
5777             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5778             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5779
5780             if( cbp&15 ) {
5781                 qmul = h->dequant4_coeff[0][s->qscale];
5782                 for( i = 0; i < 16; i++ ) {
5783                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5784                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5785                 }
5786             } else {
5787                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5788             }
5789         } else {
5790             int i8x8, i4x4;
5791             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5792                 if( cbp & (1<<i8x8) ) {
5793                     if( IS_8x8DCT(mb_type) ) {
5794                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5795                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5796                     } else {
5797                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5798                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5799                             const int index = 4*i8x8 + i4x4;
5800                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5801 //START_TIMER
5802                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5803 //STOP_TIMER("decode_residual")
5804                         }
5805                     }
5806                 } else {
5807                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5808                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5809                 }
5810             }
5811         }
5812
5813         if( cbp&0x30 ){
5814             int c;
5815             for( c = 0; c < 2; c++ ) {
5816                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5817                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5818             }
5819         }
5820
5821         if( cbp&0x20 ) {
5822             int c, i;
5823             for( c = 0; c < 2; c++ ) {
5824                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5825                 for( i = 0; i < 4; i++ ) {
5826                     const int index = 16 + 4 * c + i;
5827                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5828                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
5829                 }
5830             }
5831         } else {
5832             uint8_t * const nnz= &h->non_zero_count_cache[0];
5833             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5834             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5835         }
5836     } else {
5837         uint8_t * const nnz= &h->non_zero_count_cache[0];
5838         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5839         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5840         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5841         h->last_qscale_diff = 0;
5842     }
5843
5844     s->current_picture.qscale_table[mb_xy]= s->qscale;
5845     write_back_non_zero_count(h);
5846
5847     if(MB_MBAFF){
5848         h->ref_count[0] >>= 1;
5849         h->ref_count[1] >>= 1;
5850     }
5851
5852     return 0;
5853 }
5854
5855
5856 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5857     int i, d;
5858     const int index_a = qp + h->slice_alpha_c0_offset;
5859     const int alpha = (alpha_table+52)[index_a];
5860     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5861
5862     if( bS[0] < 4 ) {
5863         int8_t tc[4];
5864         for(i=0; i<4; i++)
5865             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5866         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5867     } else {
5868         /* 16px edge length, because bS=4 is triggered by being at
5869          * the edge of an intra MB, so all 4 bS are the same */
5870             for( d = 0; d < 16; d++ ) {
5871                 const int p0 = pix[-1];
5872                 const int p1 = pix[-2];
5873                 const int p2 = pix[-3];
5874
5875                 const int q0 = pix[0];
5876                 const int q1 = pix[1];
5877                 const int q2 = pix[2];
5878
5879                 if( FFABS( p0 - q0 ) < alpha &&
5880                     FFABS( p1 - p0 ) < beta &&
5881                     FFABS( q1 - q0 ) < beta ) {
5882
5883                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5884                         if( FFABS( p2 - p0 ) < beta)
5885                         {
5886                             const int p3 = pix[-4];
5887                             /* p0', p1', p2' */
5888                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5889                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5890                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5891                         } else {
5892                             /* p0' */
5893                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5894                         }
5895                         if( FFABS( q2 - q0 ) < beta)
5896                         {
5897                             const int q3 = pix[3];
5898                             /* q0', q1', q2' */
5899                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5900                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5901                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5902                         } else {
5903                             /* q0' */
5904                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5905                         }
5906                     }else{
5907                         /* p0', q0' */
5908                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5909                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5910                     }
5911                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5912                 }
5913                 pix += stride;
5914             }
5915     }
5916 }
5917 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5918     int i;
5919     const int index_a = qp + h->slice_alpha_c0_offset;
5920     const int alpha = (alpha_table+52)[index_a];
5921     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5922
5923     if( bS[0] < 4 ) {
5924         int8_t tc[4];
5925         for(i=0; i<4; i++)
5926             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5927         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5928     } else {
5929         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5930     }
5931 }
5932
5933 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5934     int i;
5935     for( i = 0; i < 16; i++, pix += stride) {
5936         int index_a;
5937         int alpha;
5938         int beta;
5939
5940         int qp_index;
5941         int bS_index = (i >> 1);
5942         if (!MB_FIELD) {
5943             bS_index &= ~1;
5944             bS_index |= (i & 1);
5945         }
5946
5947         if( bS[bS_index] == 0 ) {
5948             continue;
5949         }
5950
5951         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5952         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5953         alpha = (alpha_table+52)[index_a];
5954         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5955
5956         if( bS[bS_index] < 4 ) {
5957             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5958             const int p0 = pix[-1];
5959             const int p1 = pix[-2];
5960             const int p2 = pix[-3];
5961             const int q0 = pix[0];
5962             const int q1 = pix[1];
5963             const int q2 = pix[2];
5964
5965             if( FFABS( p0 - q0 ) < alpha &&
5966                 FFABS( p1 - p0 ) < beta &&
5967                 FFABS( q1 - q0 ) < beta ) {
5968                 int tc = tc0;
5969                 int i_delta;
5970
5971                 if( FFABS( p2 - p0 ) < beta ) {
5972                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5973                     tc++;
5974                 }
5975                 if( FFABS( q2 - q0 ) < beta ) {
5976                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5977                     tc++;
5978                 }
5979
5980                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5981                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5982                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5983                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5984             }
5985         }else{
5986             const int p0 = pix[-1];
5987             const int p1 = pix[-2];
5988             const int p2 = pix[-3];
5989
5990             const int q0 = pix[0];
5991             const int q1 = pix[1];
5992             const int q2 = pix[2];
5993
5994             if( FFABS( p0 - q0 ) < alpha &&
5995                 FFABS( p1 - p0 ) < beta &&
5996                 FFABS( q1 - q0 ) < beta ) {
5997
5998                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5999                     if( FFABS( p2 - p0 ) < beta)
6000                     {
6001                         const int p3 = pix[-4];
6002                         /* p0', p1', p2' */
6003                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6004                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6005                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6006                     } else {
6007                         /* p0' */
6008                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6009                     }
6010                     if( FFABS( q2 - q0 ) < beta)
6011                     {
6012                         const int q3 = pix[3];
6013                         /* q0', q1', q2' */
6014                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6015                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6016                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6017                     } else {
6018                         /* q0' */
6019                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6020                     }
6021                 }else{
6022                     /* p0', q0' */
6023                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6024                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6025                 }
6026                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6027             }
6028         }
6029     }
6030 }
6031 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6032     int i;
6033     for( i = 0; i < 8; i++, pix += stride) {
6034         int index_a;
6035         int alpha;
6036         int beta;
6037
6038         int qp_index;
6039         int bS_index = i;
6040
6041         if( bS[bS_index] == 0 ) {
6042             continue;
6043         }
6044
6045         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6046         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6047         alpha = (alpha_table+52)[index_a];
6048         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6049
6050         if( bS[bS_index] < 4 ) {
6051             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6052             const int p0 = pix[-1];
6053             const int p1 = pix[-2];
6054             const int q0 = pix[0];
6055             const int q1 = pix[1];
6056
6057             if( FFABS( p0 - q0 ) < alpha &&
6058                 FFABS( p1 - p0 ) < beta &&
6059                 FFABS( q1 - q0 ) < beta ) {
6060                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6061
6062                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6063                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6064                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6065             }
6066         }else{
6067             const int p0 = pix[-1];
6068             const int p1 = pix[-2];
6069             const int q0 = pix[0];
6070             const int q1 = pix[1];
6071
6072             if( FFABS( p0 - q0 ) < alpha &&
6073                 FFABS( p1 - p0 ) < beta &&
6074                 FFABS( q1 - q0 ) < beta ) {
6075
6076                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6077                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6078                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6079             }
6080         }
6081     }
6082 }
6083
6084 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6085     int i, d;
6086     const int index_a = qp + h->slice_alpha_c0_offset;
6087     const int alpha = (alpha_table+52)[index_a];
6088     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6089     const int pix_next  = stride;
6090
6091     if( bS[0] < 4 ) {
6092         int8_t tc[4];
6093         for(i=0; i<4; i++)
6094             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6095         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6096     } else {
6097         /* 16px edge length, see filter_mb_edgev */
6098             for( d = 0; d < 16; d++ ) {
6099                 const int p0 = pix[-1*pix_next];
6100                 const int p1 = pix[-2*pix_next];
6101                 const int p2 = pix[-3*pix_next];
6102                 const int q0 = pix[0];
6103                 const int q1 = pix[1*pix_next];
6104                 const int q2 = pix[2*pix_next];
6105
6106                 if( FFABS( p0 - q0 ) < alpha &&
6107                     FFABS( p1 - p0 ) < beta &&
6108                     FFABS( q1 - q0 ) < beta ) {
6109
6110                     const int p3 = pix[-4*pix_next];
6111                     const int q3 = pix[ 3*pix_next];
6112
6113                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6114                         if( FFABS( p2 - p0 ) < beta) {
6115                             /* p0', p1', p2' */
6116                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6117                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6118                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6119                         } else {
6120                             /* p0' */
6121                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6122                         }
6123                         if( FFABS( q2 - q0 ) < beta) {
6124                             /* q0', q1', q2' */
6125                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6126                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6127                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6128                         } else {
6129                             /* q0' */
6130                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6131                         }
6132                     }else{
6133                         /* p0', q0' */
6134                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6135                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6136                     }
6137                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6138                 }
6139                 pix++;
6140             }
6141     }
6142 }
6143
6144 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6145     int i;
6146     const int index_a = qp + h->slice_alpha_c0_offset;
6147     const int alpha = (alpha_table+52)[index_a];
6148     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6149
6150     if( bS[0] < 4 ) {
6151         int8_t tc[4];
6152         for(i=0; i<4; i++)
6153             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6154         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6155     } else {
6156         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6157     }
6158 }
6159
/**
 * Fast-path deblocking of one macroblock.
 *
 * Hands the macroblock to filter_mb() whenever one of the preconditions
 * tested at the top fails; otherwise filters the luma and chroma edges
 * itself, vertical edges first, then horizontal ones.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column; column 0 always takes the filter_mb() path
 * @param mb_y       macroblock row; the first row always takes the filter_mb() path
 * @param img_y      top-left luma sample of the macroblock
 * @param img_cb     top-left Cb sample of the macroblock
 * @param img_cr     top-left Cr sample of the macroblock
 * @param linesize   luma stride passed on to the edge filters
 * @param uvlinesize chroma stride passed on to the edge filters
 */
6160 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6161     MpegEncContext * const s = &h->s;
         /* index of the first macroblock row: 1 for a bottom-field picture, else 0 */
6162     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6163     int mb_xy, mb_type;
6164     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6165
6166     mb_xy = h->mb_xy;
6167
         /* Fall back to filter_mb() when there is no left or top neighbour row/column,
          * no DSP strength routine, a non-zero pps.chroma_qp_diff, the fast flag is
          * not set, or (deblocking_filter == 2) the top or left macroblock belongs
          * to a different slice. */
6168     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6169         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus has to be, but should not under CODEC_FLAG2_FAST
6170        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6171                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6172         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6173         return;
6174     }
6175     assert(!FRAME_MBAFF);
6176
         /* qp/qpc: this macroblock; qp0/qpc0: rounded average with the left
          * neighbour; qp1/qpc1: rounded average with the top neighbour.
          * Only chroma table 0 is consulted for both chroma planes; the
          * chroma_qp_diff test above presumably guarantees both planes share
          * it -- confirm against the PPS parsing code. */
6177     mb_type = s->current_picture.mb_type[mb_xy];
6178     qp = s->current_picture.qscale_table[mb_xy];
6179     qp0 = s->current_picture.qscale_table[mb_xy-1];
6180     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6181     qpc = get_chroma_qp( h, 0, qp );
6182     qpc0 = get_chroma_qp( h, 0, qp0 );
6183     qpc1 = get_chroma_qp( h, 0, qp1 );
6184     qp0 = (qp + qp0 + 1) >> 1;
6185     qp1 = (qp + qp1 + 1) >> 1;
6186     qpc0 = (qpc + qpc0 + 1) >> 1;
6187     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* Skip the macroblock entirely when every quantizer involved is at or
          * below the threshold (presumably alpha is 0 for such table indices,
          * so no edge could pass the filter test -- confirm against alpha_table). */
6188     qp_thresh = 15 - h->slice_alpha_c0_offset;
6189     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6190        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6191         return;
6192
6193     if( IS_INTRA(mb_type) ) {
             /* Intra macroblock: fixed strengths. Left edge 4, internal edges 3,
              * top edge 3 in field pictures and 4 otherwise (bSH). */
6194         int16_t bS4[4] = {4,4,4,4};
6195         int16_t bS3[4] = {3,3,3,3};
6196         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6197         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only edges 0 and 2 are filtered */
6198             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6199             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6200             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6201             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6202         } else {
6203             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6204             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6205             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6206             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6207             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6208             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6209             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6210             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6211         }
             /* chroma: two edges per direction and plane (offsets 0 and 4 samples) */
6212         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6213         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6214         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6215         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6216         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6217         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6218         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6219         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6220         return;
6221     } else {
             /* bS[dir][edge][i]: dir 0 = vertical edges, dir 1 = horizontal edges.
              * bSv aliases the same storage with one uint64_t per edge so that the
              * four 16-bit strengths of an edge can be set or tested in one go. */
6222         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6223         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6224         int edges;
6225         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* 8x8 transform with the first three cbp luma bits set: edges 0
                  * and 2 get strength 2 in both directions without consulting
                  * the strength routine. */
6226             edges = 4;
6227             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6228         } else {
6229             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6230                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6231             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6232                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6233                              ? 3 : 0;
6234             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* a 16x16 macroblock with no luma cbp bits only needs its outer edge */
6235             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6236             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6237                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6238         }
             /* intra neighbours override the strengths of the outer edges */
6239         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6240             bSv[0][0] = 0x0004000400040004ULL;
6241         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6242             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6243
             /* FILTER(hv,dir,edge): if any strength of the edge is non-zero, filter
              * the luma edge and, for even edges, the matching Cb and Cr edges.
              * Edge 0 uses the neighbour-averaged quantizer (qp0/qp1, qpc0/qpc1),
              * the other edges use the macroblock's own. */
6244 #define FILTER(hv,dir,edge)\
6245         if(bSv[dir][edge]) {\
6246             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6247             if(!(edge&1)) {\
6248                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6249                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6250             }\
6251         }
6252         if( edges == 1 ) {
6253             FILTER(v,0,0);
6254             FILTER(h,1,0);
6255         } else if( IS_8x8DCT(mb_type) ) {
6256             FILTER(v,0,0);
6257             FILTER(v,0,2);
6258             FILTER(h,1,0);
6259             FILTER(h,1,2);
6260         } else {
6261             FILTER(v,0,0);
6262             FILTER(v,0,1);
6263             FILTER(v,0,2);
6264             FILTER(v,0,3);
6265             FILTER(h,1,0);
6266             FILTER(h,1,1);
6267             FILTER(h,1,2);
6268             FILTER(h,1,3);
6269         }
6270 #undef FILTER
6271     }
6272 }
6273
6274
6275 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6276     MpegEncContext * const s = &h->s;
6277     int edge;
6278     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6279     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6280     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6281     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6282     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6283
6284     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6285                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6286     // how often to recheck mv-based bS when iterating between edges
6287     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6288                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6289     // how often to recheck mv-based bS when iterating along each edge
6290     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6291
6292     if (first_vertical_edge_done) {
6293         start = 1;
6294     }
6295
6296     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6297         start = 1;
6298
6299     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6300         && !IS_INTERLACED(mb_type)
6301         && IS_INTERLACED(mbm_type)
6302         ) {
6303         // This is a special case in the norm where the filtering must
6304         // be done twice (one each of the field) even if we are in a
6305         // frame macroblock.
6306         //
6307         static const int nnz_idx[4] = {4,5,6,3};
6308         unsigned int tmp_linesize   = 2 *   linesize;
6309         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6310         int mbn_xy = mb_xy - 2 * s->mb_stride;
6311         int qp;
6312         int i, j;
6313         int16_t bS[4];
6314
6315         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6316             if( IS_INTRA(mb_type) ||
6317                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6318                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6319             } else {
6320                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6321                 for( i = 0; i < 4; i++ ) {
6322                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6323                         mbn_nnz[nnz_idx[i]] != 0 )
6324                         bS[i] = 2;
6325                     else
6326                         bS[i] = 1;
6327                 }
6328             }
6329             // Do not use s->qscale as luma quantizer because it has not the same
6330             // value in IPCM macroblocks.
6331             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6332             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6333             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6334             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6335             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6336                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6337             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6338                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6339         }
6340
6341         start = 1;
6342     }
6343
6344     /* Calculate bS */
6345     for( edge = start; edge < edges; edge++ ) {
6346         /* mbn_xy: neighbor macroblock */
6347         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6348         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6349         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6350         int16_t bS[4];
6351         int qp;
6352
6353         if( (edge&1) && IS_8x8DCT(mb_type) )
6354             continue;
6355
6356         if( IS_INTRA(mb_type) ||
6357             IS_INTRA(mbn_type) ) {
6358             int value;
6359             if (edge == 0) {
6360                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6361                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6362                 ) {
6363                     value = 4;
6364                 } else {
6365                     value = 3;
6366                 }
6367             } else {
6368                 value = 3;
6369             }
6370             bS[0] = bS[1] = bS[2] = bS[3] = value;
6371         } else {
6372             int i, l;
6373             int mv_done;
6374
6375             if( edge & mask_edge ) {
6376                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6377                 mv_done = 1;
6378             }
6379             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6380                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6381                 mv_done = 1;
6382             }
6383             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6384                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6385                 int bn_idx= b_idx - (dir ? 8:1);
6386                 int v = 0;
6387
6388                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6389                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6390                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6391                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6392                 }
6393
6394                 if(h->slice_type_nos == FF_B_TYPE && v){
6395                     v=0;
6396                     for( l = 0; !v && l < 2; l++ ) {
6397                         int ln= 1-l;
6398                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6399                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6400                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6401                     }
6402                 }
6403
6404                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6405                 mv_done = 1;
6406             }
6407             else
6408                 mv_done = 0;
6409
6410             for( i = 0; i < 4; i++ ) {
6411                 int x = dir == 0 ? edge : i;
6412                 int y = dir == 0 ? i    : edge;
6413                 int b_idx= 8 + 4 + x + 8*y;
6414                 int bn_idx= b_idx - (dir ? 8:1);
6415
6416                 if( h->non_zero_count_cache[b_idx] |
6417                     h->non_zero_count_cache[bn_idx] ) {
6418                     bS[i] = 2;
6419                 }
6420                 else if(!mv_done)
6421                 {
6422                     bS[i] = 0;
6423                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6424                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6425                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6426                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6427                             bS[i] = 1;
6428                             break;
6429                         }
6430                     }
6431
6432                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6433                         bS[i] = 0;
6434                         for( l = 0; l < 2; l++ ) {
6435                             int ln= 1-l;
6436                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6437                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6438                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6439                                 bS[i] = 1;
6440                                 break;
6441                             }
6442                         }
6443                     }
6444                 }
6445             }
6446
6447             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6448                 continue;
6449         }
6450
6451         /* Filter edge */
6452         // Do not use s->qscale as luma quantizer because it has not the same
6453         // value in IPCM macroblocks.
6454         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6455         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6456         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6457         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6458         if( dir == 0 ) {
6459             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6460             if( (edge&1) == 0 ) {
6461                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6462                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6463                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6464                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6465             }
6466         } else {
6467             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6468             if( (edge&1) == 0 ) {
6469                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6470                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6471                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6472                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6473             }
6474         }
6475     }
6476 }
6477
/**
 * Runs the deblocking filter for one macroblock.
 *
 * Steps, in order:
 *  1. When FRAME_MBAFF is false, return without filtering if the MB's qp and
 *     its rounded averages with the left and top neighbours are all at or
 *     below a conservative threshold.
 *  2. For CAVLC streams with transform_8x8_mode, overwrite entries of
 *     h->non_zero_count_cache from coded-block-pattern bits.
 *  3. In MBAFF frames, when the left MB has a different interlaced type,
 *     filter the first vertical edge here (8 bS values, 2 qp values).
 *  4. Call filter_mb_dir() for dir 0 and then dir 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row (mb_xy = mb_x + mb_y*s->mb_stride)
 * @param img_y      luma pointer handed to the edge filters
 * @param img_cb     Cb pointer handed to the edge filters
 * @param img_cr     Cr pointer handed to the edge filters
 * @param linesize   stride passed with img_y
 * @param uvlinesize stride passed with img_cb / img_cr
 */
6478 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6479     MpegEncContext * const s = &h->s;
6480     const int mb_xy= mb_x + mb_y*s->mb_stride;
6481     const int mb_type = s->current_picture.mb_type[mb_xy];
6482     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6483     int first_vertical_edge_done = 0; // set to 1 below if the MBAFF path filters the first vertical edge
6484     int dir;
6485
6486     //for sufficiently low qp, filtering wouldn't do anything
6487     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6488     if(!FRAME_MBAFF){
6489         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6490         int qp = s->current_picture.qscale_table[mb_xy];
             // the left check is skipped in column 0 and the top check in row 0
6491         if(qp <= qp_thresh
6492            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6493            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6494             return;
6495         }
6496     }
6497
6498     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6499     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6500         int top_type, left_type[2];
6501         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6502         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6503         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6504
             // top neighbour: cache entries 4..7 of row 0 take cbp bits 4 and 8 of the top MB
6505         if(IS_8x8DCT(top_type)){
6506             h->non_zero_count_cache[4+8*0]=
6507             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6508             h->non_zero_count_cache[6+8*0]=
6509             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6510         }
             // left neighbours: cache column 3, rows 1-2 from left_mb_xy[0], rows 3-4 from left_mb_xy[1]
6511         if(IS_8x8DCT(left_type[0])){
6512             h->non_zero_count_cache[3+8*1]=
6513             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6514         }
6515         if(IS_8x8DCT(left_type[1])){
6516             h->non_zero_count_cache[3+8*3]=
6517             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6518         }
6519
             // current MB: each group of four scan8 entries takes one of cbp bits 1, 2, 4, 8
6520         if(IS_8x8DCT(mb_type)){
6521             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6522             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6523
6524             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6525             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6526
6527             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6528             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6529
6530             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6531             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6532         }
6533     }
6534
6535     if (FRAME_MBAFF
6536             // left mb is in picture
6537             && h->slice_table[mb_xy-1] != 0xFFFF
6538             // and current and left pair do not have the same interlaced type
6539             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6540             // and left mb is in the same slice if deblocking_filter == 2
6541             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6542         /* First vertical edge is different in MBAFF frames
6543          * There are 8 different bS to compute and 2 different Qp
6544          */
6545         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride; // index using the even row of the current MB pair
6546         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride }; // the two MBs of the left pair
6547         int16_t bS[8];
6548         int qp[2];  // luma qp, averaged with left_mb_xy[0] / left_mb_xy[1]
6549         int bqp[2]; // same averaging on get_chroma_qp(h, 0, ...), used for img_cb
6550         int rqp[2]; // same averaging on get_chroma_qp(h, 1, ...), used for img_cr
6551         int mb_qp, mbn0_qp, mbn1_qp;
6552         int i;
6553         first_vertical_edge_done = 1;
6554
6555         if( IS_INTRA(mb_type) )
6556             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6557         else {
6558             for( i = 0; i < 8; i++ ) {
                     // MB_FIELD: entries 0-3 use left_mb_xy[0], 4-7 use left_mb_xy[1]; otherwise they alternate
6559                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6560
6561                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6562                     bS[i] = 4;
                     // bS 2 when either side has coefficients: the current MB side comes from the cache,
                     // the neighbour side from cbp_table (CAVLC 8x8dct neighbour) or non_zero_count
6563                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6564                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6565                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6566                                                                        :
6567                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6568                     bS[i] = 2;
6569                 else
6570                     bS[i] = 1;
6571             }
6572         }
6573
6574         mb_qp = s->current_picture.qscale_table[mb_xy];
6575         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6576         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6577         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6578         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6579                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6580         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6581                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6582         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6583         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6584                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6585         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6586                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6587
6588         /* Filter edge */
6589         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6590         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6591         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6592         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6593         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6594     }
6595
         // Both branches make the same two calls: dir 0 receives first_vertical_edge_done, dir 1 receives 0.
6596 #ifdef CONFIG_SMALL
6597     for( dir = 0; dir < 2; dir++ )
6598         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6599 #else
6600     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6601     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6602 #endif
6603 }
6604
/**
 * Decodes the macroblocks of one slice, using the CABAC or the CAVLC path
 * depending on h->pps.cabac.
 *
 * Each macroblock is parsed (decode_mb_cabac / decode_mb_cavlc) and, on
 * success, reconstructed with hl_decode_mb(). With FRAME_MBAFF a second
 * macroblock at mb_y+1 is decoded in the same iteration. The decoded range
 * is reported through ff_er_add_slice() on both success and failure.
 *
 * @param avctx not referenced in the body (h->s.avctx is used instead)
 * @param arg   pointer to a pointer to the H264Context to decode with
 * @return 0 when the slice ends cleanly, -1 on a decoding error or when
 *         the consumed bit count does not match the buffer size (CAVLC)
 */
6605 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6606     H264Context *h = *(void**)arg;
6607     MpegEncContext * const s = &h->s;
         // mask applied to every status value passed to ff_er_add_slice() below
6608     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6609
6610     s->mb_skip_run= -1;
6611
6612     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6613                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6614
6615     if( h->pps.cabac ) {
6616         int i;
6617
6618         /* realign */
6619         align_get_bits( &s->gb );
6620
6621         /* init cabac */
             // the CABAC decoder starts at the current byte position and is given the remaining byte count
6622         ff_init_cabac_states( &h->cabac);
6623         ff_init_cabac_decoder( &h->cabac,
6624                                s->gb.buffer + get_bits_count(&s->gb)/8,
6625                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6626         /* calculate pre-state */
             // one state per context (460): table slope scaled by qscale plus offset, clipped to 1..126
6627         for( i= 0; i < 460; i++ ) {
6628             int pre;
6629             if( h->slice_type_nos == FF_I_TYPE )
6630                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6631             else
6632                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6633
                 // pre <= 63 maps to an even state value, pre >= 64 to an odd one
6634             if( pre <= 63 )
6635                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6636             else
6637                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6638         }
6639
6640         for(;;){
6641 //START_TIMER
6642             int ret = decode_mb_cabac(h);
6643             int eos;
6644 //STOP_TIMER("decode_mb_cabac")
6645
6646             if(ret>=0) hl_decode_mb(h);
6647
                 // MBAFF: decode the MB at mb_y+1 as well, then restore mb_y
6648             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6649                 s->mb_y++;
6650
6651                 if(ret>=0) ret = decode_mb_cabac(h);
6652
6653                 if(ret>=0) hl_decode_mb(h);
6654                 s->mb_y--;
6655             }
6656             eos = get_cabac_terminate( &h->cabac );
6657
                 // fail on a parse error or when the read pointer is more than 2 bytes past the end
6658             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6659                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6660                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6661                 return -1;
6662             }
6663
                 // end of row: output the finished band and advance mb_y (by 2 for field/MBAFF pictures)
6664             if( ++s->mb_x >= s->mb_width ) {
6665                 s->mb_x = 0;
6666                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6667                 ++s->mb_y;
6668                 if(FIELD_OR_MBAFF_PICTURE) {
6669                     ++s->mb_y;
6670                 }
6671             }
6672
                 // the slice ends on the terminate flag or after the last MB row
6673             if( eos || s->mb_y >= s->mb_height ) {
6674                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6675                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6676                 return 0;
6677             }
6678         }
6679
6680     } else {
6681         for(;;){
6682             int ret = decode_mb_cavlc(h);
6683
6684             if(ret>=0) hl_decode_mb(h);
6685
                 // MBAFF: decode the MB at mb_y+1 as well, then restore mb_y
6686             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6687                 s->mb_y++;
6688                 ret = decode_mb_cavlc(h);
6689
6690                 if(ret>=0) hl_decode_mb(h);
6691                 s->mb_y--;
6692             }
6693
6694             if(ret<0){
6695                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6696                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6697
6698                 return -1;
6699             }
6700
6701             if(++s->mb_x >= s->mb_width){
6702                 s->mb_x=0;
6703                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6704                 ++s->mb_y;
6705                 if(FIELD_OR_MBAFF_PICTURE) {
6706                     ++s->mb_y;
6707                 }
                     // past the last row: success only if exactly all bits were consumed
6708                 if(s->mb_y >= s->mb_height){
6709                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6710
6711                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6712                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6713
6714                         return 0;
6715                     }else{
                             // note: this path reports the *_END flags yet still returns -1
6716                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6717
6718                         return -1;
6719                     }
6720                 }
6721             }
6722
                 // buffer exhausted with no skip run pending: clean end only on an exact bit match
6723             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6724                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6725                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6726                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6727
6728                     return 0;
6729                 }else{
6730                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6731
6732                     return -1;
6733                 }
6734             }
6735         }
6736     }
6737
     // Disabled code: never compiled. NOTE(review): it contains "s->?gb" / "s->gb?." and
     // calls decode_mb(), so it would not build if enabled; candidate for removal.
6738 #if 0
6739     for(;s->mb_y < s->mb_height; s->mb_y++){
6740         for(;s->mb_x < s->mb_width; s->mb_x++){
6741             int ret= decode_mb(h);
6742
6743             hl_decode_mb(h);
6744
6745             if(ret<0){
6746                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6747                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6748
6749                 return -1;
6750             }
6751
6752             if(++s->mb_x >= s->mb_width){
6753                 s->mb_x=0;
6754                 if(++s->mb_y >= s->mb_height){
6755                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6756                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6757
6758                         return 0;
6759                     }else{
6760                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6761
6762                         return -1;
6763                     }
6764                 }
6765             }
6766
6767             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6768                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6769                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6770
6771                     return 0;
6772                 }else{
6773                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6774
6775                     return -1;
6776                 }
6777             }
6778         }
6779         s->mb_x=0;
6780         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6781     }
6782 #endif
         // both for(;;) loops above only leave via return
6783     return -1; //not reached
6784 }
6785
/**
 * Parses a picture timing SEI payload from s->gb.
 *
 * The only value stored is h->sei_pic_struct; every other field is read
 * past. Which fields are present depends on flags and lengths held in
 * h->sps (HRD present flags, pic_struct_present_flag, time_offset_length).
 *
 * @return 0 on success, -1 if the 4-bit pic_struct value exceeds
 *         SEI_PIC_STRUCT_FRAME_TRIPLING
 */
6786 static int decode_picture_timing(H264Context *h){
6787     MpegEncContext * const s = &h->s;
         // the two delay fields are present only when NAL or VCL HRD parameters were signalled
6788     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6789         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6790         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6791     }
6792     if(h->sps.pic_struct_present_flag){
6793         unsigned int i, num_clock_ts;
6794         h->sei_pic_struct = get_bits(&s->gb, 4);
6795
             // validated before it is used as an index into sei_num_clock_ts_table
6796         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6797             return -1;
6798
6799         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6800
             // each clock timestamp is parsed only to advance the bit reader; nothing is stored
6801         for (i = 0 ; i < num_clock_ts ; i++){
6802             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6803                 unsigned int full_timestamp_flag;
6804                 skip_bits(&s->gb, 2);                 /* ct_type */
6805                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6806                 skip_bits(&s->gb, 5);                 /* counting_type */
6807                 full_timestamp_flag = get_bits(&s->gb, 1);
6808                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6809                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6810                 skip_bits(&s->gb, 8);                 /* n_frames */
6811                 if(full_timestamp_flag){
6812                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6813                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6814                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6815                 }else{
                         // without a full timestamp each unit is gated by a flag, nested seconds -> minutes -> hours
6816                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6817                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6818                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6819                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6820                             if(get_bits(&s->gb, 1))   /* hours_flag */
6821                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6822                         }
6823                     }
6824                 }
6825                 if(h->sps.time_offset_length > 0)
6826                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6827             }
6828         }
6829     }
6830     return 0;
6831 }
6832
/**
 * Reads an unregistered user data SEI payload and looks for an x264
 * version string in it.
 *
 * Up to sizeof(user_data)-1 bytes are copied into a local buffer and NUL
 * terminated; the text after the first 16 bytes is matched against
 * "x264 - core %d". On a match with a non-negative number the value is
 * stored in h->x264_build. Payload bytes that did not fit in the buffer
 * are skipped so that all of `size` bytes are consumed.
 *
 * @param size payload size in bytes
 * @return 0 on success, -1 if size is below 16
 */
6833 static int decode_unregistered_user_data(H264Context *h, int size){
6834     MpegEncContext * const s = &h->s;
6835     uint8_t user_data[16+256];
6836     int e, build, i;
6837
         // the first 16 bytes are skipped by the parsing below (presumably the UUID — TODO confirm)
6838     if(size<16)
6839         return -1;
6840
         // stops one byte short of the buffer so the terminator written below always fits
6841     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6842         user_data[i]= get_bits(&s->gb, 8);
6843     }
6844
6845     user_data[i]= 0;
6846     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6847     if(e==1 && build>=0)
6848         h->x264_build= build;
6849
6850     if(s->avctx->debug & FF_DEBUG_BUGS)
6851         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6852
         // consume whatever part of the payload did not fit into user_data
6853     for(; i<size; i++)
6854         skip_bits(&s->gb, 8);
6855
6856     return 0;
6857 }
6858
/**
 * Parses SEI messages from s->gb while more than 16 bits remain.
 *
 * For each message the type and the size are read as sums of bytes, where
 * a byte equal to 255 means another byte follows. Type 1 is handed to
 * decode_picture_timing(), type 5 to decode_unregistered_user_data();
 * any other type is skipped using its size. The reader is byte-aligned
 * after every message.
 *
 * @return 0 on success, -1 if one of the payload decoders fails
 */
6859 static int decode_sei(H264Context *h){
6860     MpegEncContext * const s = &h->s;
6861
6862     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6863         int size, type;
6864
             // show_bits() peeks at the byte that the get_bits() in the loop condition then consumes
6865         type=0;
6866         do{
6867             type+= show_bits(&s->gb, 8);
6868         }while(get_bits(&s->gb, 8) == 255);
6869
6870         size=0;
6871         do{
6872             size+= show_bits(&s->gb, 8);
6873         }while(get_bits(&s->gb, 8) == 255);
6874
6875         switch(type){
6876         case 1: // Picture timing SEI
6877             if(decode_picture_timing(h) < 0)
6878                 return -1;
6879             break;
6880         case 5: // unregistered user data SEI
6881             if(decode_unregistered_user_data(h, size) < 0)
6882                 return -1;
6883             break;
6884         default: // unhandled type: skip the payload, size is in bytes
6885             skip_bits(&s->gb, 8*size);
6886         }
6887
6888         //FIXME check bits here
6889         align_get_bits(&s->gb);
6890     }
6891
6892     return 0;
6893 }
6894
/**
 * Parses one set of HRD parameters from s->gb.
 *
 * Only three field lengths are kept, written into sps:
 * cpb_removal_delay_length, dpb_output_delay_length and
 * time_offset_length. All other values are read and discarded.
 *
 * @param sps SPS that receives the three lengths
 * @return 0 on success, -1 if cpb_count is above 32
 */
6895 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6896     MpegEncContext * const s = &h->s;
6897     int cpb_count, i;
6898     cpb_count = get_ue_golomb(&s->gb) + 1;
6899
         // comparing against 32U makes the test unsigned, so a negative cpb_count is rejected too
6900     if(cpb_count > 32U){
6901         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6902         return -1;
6903     }
6904
6905     get_bits(&s->gb, 4); /* bit_rate_scale */
6906     get_bits(&s->gb, 4); /* cpb_size_scale */
6907     for(i=0; i<cpb_count; i++){
6908         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6909         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6910         get_bits1(&s->gb);     /* cbr_flag */
6911     }
6912     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
         // two of the lengths are stored as the 5-bit value + 1; time_offset_length is stored as read
6913     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6914     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6915     sps->time_offset_length = get_bits(&s->gb, 5);
6916     return 0;
6917 }
6918
/**
 * Parses the VUI parameters from s->gb into sps.
 *
 * Fields stored in sps: sar, timing_info_present_flag (with
 * num_units_in_tick, time_scale and fixed_frame_rate_flag when set), the
 * NAL and VCL HRD present flags (HRD contents go through
 * decode_hrd_parameters()), pic_struct_present_flag,
 * bitstream_restriction_flag and num_reorder_frames. Everything else is
 * read and discarded.
 *
 * @param sps SPS being filled in
 * @return 0 on success, -1 on an unknown aspect_ratio_idc, an HRD parsing
 *         failure, or num_reorder_frames above 16
 */
6919 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6920     MpegEncContext * const s = &h->s;
6921     int aspect_ratio_info_present_flag;
6922     unsigned int aspect_ratio_idc;
6923
6924     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6925
         // sample aspect ratio: explicit 16-bit num/den, a pixel_aspect table entry, or 0/0 when absent
6926     if( aspect_ratio_info_present_flag ) {
6927         aspect_ratio_idc= get_bits(&s->gb, 8);
6928         if( aspect_ratio_idc == EXTENDED_SAR ) {
6929             sps->sar.num= get_bits(&s->gb, 16);
6930             sps->sar.den= get_bits(&s->gb, 16);
6931         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6932             sps->sar=  pixel_aspect[aspect_ratio_idc];
6933         }else{
6934             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6935             return -1;
6936         }
6937     }else{
6938         sps->sar.num=
6939         sps->sar.den= 0;
6940     }
6941 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6942
6943     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6944         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6945     }
6946
6947     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6948         get_bits(&s->gb, 3);    /* video_format */
6949         get_bits1(&s->gb);      /* video_full_range_flag */
6950         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6951             get_bits(&s->gb, 8); /* colour_primaries */
6952             get_bits(&s->gb, 8); /* transfer_characteristics */
6953             get_bits(&s->gb, 8); /* matrix_coefficients */
6954         }
6955     }
6956
6957     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6958         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6959         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6960     }
6961
6962     sps->timing_info_present_flag = get_bits1(&s->gb);
6963     if(sps->timing_info_present_flag){
6964         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6965         sps->time_scale = get_bits_long(&s->gb, 32);
6966         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6967     }
6968
         // NAL and VCL HRD sets go through the same parser, so a second set overwrites the lengths of the first
6969     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6970     if(sps->nal_hrd_parameters_present_flag)
6971         if(decode_hrd_parameters(h, sps) < 0)
6972             return -1;
6973     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6974     if(sps->vcl_hrd_parameters_present_flag)
6975         if(decode_hrd_parameters(h, sps) < 0)
6976             return -1;
6977     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6978         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6979     sps->pic_struct_present_flag = get_bits1(&s->gb);
6980
6981     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6982     if(sps->bitstream_restriction_flag){
6983         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6984         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6985         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6986         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6987         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6988         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6989         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6990
             // only num_reorder_frames is range-checked; max_dec_frame_buffering was read and discarded above
6991         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6992             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6993             return -1;
6994         }
6995     }
6996
6997     return 0;
6998 }
6999
/**
 * Reads one scaling list from s->gb into factors.
 *
 * A leading flag of 0 means the list is not coded, and fallback_list is
 * copied. Otherwise entries are delta-coded in scan order: each value is
 * (previous + signed delta) & 0xff. If the very first value comes out as 0,
 * jvt_list is copied instead. Once a later value comes out as 0, no more
 * deltas are read and the remaining entries repeat the last value.
 *
 * @param factors       output list of `size` entries
 * @param size          number of entries; 16 selects zigzag_scan, any
 *                      other value selects zigzag_scan8x8
 * @param jvt_list      preset list copied when the first coded value is 0
 * @param fallback_list list copied when the list is not coded
 */
7000 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7001                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7002     MpegEncContext * const s = &h->s;
7003     int i, last = 8, next = 8; // the delta coding starts from the value 8
7004     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7005     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7006         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7007     else
7008     for(i=0;i<size;i++){
             // after next has become 0 no further deltas are read
7009         if(next)
7010             next = (last + get_se_golomb(&s->gb)) & 0xff;
7011         if(!i && !next){ /* matrix not written, we use the preset one */
7012             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7013             break;
7014         }
             // entries are stored at their scan position; a zero next repeats the last value
7015         last = factors[scan[i]] = next ? next : last;
7016     }
7017 }
7018
/**
 * Reads the scaling matrices of an SPS or a PPS from s->gb.
 *
 * Nothing is written unless the leading present flag is set. When it is,
 * six 4x4 lists are read, followed by two 8x8 lists if is_sps is set or
 * pps->transform_8x8_mode is non-zero. The first list of each group (Y)
 * falls back to the SPS matrices when parsing a PPS whose SPS carried
 * matrices, otherwise to the default tables; the other lists of a group
 * fall back to the list read just before them.
 *
 * NOTE(review): the chroma labels on the calls below follow the Y, Cb, Cr
 * list order of H.264 Table 7-2; the original labels had Cr and Cb the
 * other way round — confirm against the specification.
 *
 * @param sps             SPS; scaling_matrix_present is set when is_sps
 *                        and the present flag are both set
 * @param pps             PPS, only dereferenced when is_sps is 0
 * @param is_sps          non-zero when parsing an SPS
 * @param scaling_matrix4 destination for the six 4x4 lists
 * @param scaling_matrix8 destination for the two 8x8 lists
 */
7019 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7020                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7021     MpegEncContext * const s = &h->s;
7022     int fallback_sps = !is_sps && sps->scaling_matrix_present;
         // fallback sources for the four Y lists: 4x4 intra, 4x4 inter, 8x8 intra, 8x8 inter
7023     const uint8_t *fallback[4] = {
7024         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7025         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7026         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7027         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7028     };
7029     if(get_bits1(&s->gb)){
7030         sps->scaling_matrix_present |= is_sps;
7031         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7032         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cb
7033         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cr
7034         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7035         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cb
7036         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cr
             // pps is only dereferenced when is_sps is 0, so an SPS caller may pass NULL
7037         if(is_sps || pps->transform_8x8_mode){
7038             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7039             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7040         }
7041     }
7042 }
7043
7044 static inline int decode_seq_parameter_set(H264Context *h){
7045     MpegEncContext * const s = &h->s;
7046     int profile_idc, level_idc;
7047     unsigned int sps_id;
7048     int i;
7049     SPS *sps;
7050
7051     profile_idc= get_bits(&s->gb, 8);
7052     get_bits1(&s->gb);   //constraint_set0_flag
7053     get_bits1(&s->gb);   //constraint_set1_flag
7054     get_bits1(&s->gb);   //constraint_set2_flag
7055     get_bits1(&s->gb);   //constraint_set3_flag
7056     get_bits(&s->gb, 4); // reserved
7057     level_idc= get_bits(&s->gb, 8);
7058     sps_id= get_ue_golomb(&s->gb);
7059
7060     if(sps_id >= MAX_SPS_COUNT) {
7061         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7062         return -1;
7063     }
7064     sps= av_mallocz(sizeof(SPS));
7065     if(sps == NULL)
7066         return -1;
7067
7068     sps->profile_idc= profile_idc;
7069     sps->level_idc= level_idc;
7070
7071     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7072     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7073     sps->scaling_matrix_present = 0;
7074
7075     if(sps->profile_idc >= 100){ //high profile
7076         sps->chroma_format_idc= get_ue_golomb(&s->gb);
7077         if(sps->chroma_format_idc == 3)
7078             get_bits1(&s->gb);  //residual_color_transform_flag
7079         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7080         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7081         sps->transform_bypass = get_bits1(&s->gb);
7082         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7083     }else{
7084         sps->chroma_format_idc= 1;
7085     }
7086
7087     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7088     sps->poc_type= get_ue_golomb(&s->gb);
7089
7090     if(sps->poc_type == 0){ //FIXME #define
7091         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7092     } else if(sps->poc_type == 1){//FIXME #define
7093         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7094         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7095         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7096         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7097
7098         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7099             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7100             goto fail;
7101         }
7102
7103         for(i=0; i<sps->poc_cycle_length; i++)
7104             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7105     }else if(sps->poc_type != 2){
7106         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7107         goto fail;
7108     }
7109
7110     sps->ref_frame_count= get_ue_golomb(&s->gb);
7111     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7112         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7113         goto fail;
7114     }
7115     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7116     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7117     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7118     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7119        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7120         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7121         goto fail;
7122     }
7123
7124     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7125     if(!sps->frame_mbs_only_flag)
7126         sps->mb_aff= get_bits1(&s->gb);
7127     else
7128         sps->mb_aff= 0;
7129
7130     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7131
7132 #ifndef ALLOW_INTERLACE
7133     if(sps->mb_aff)
7134         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7135 #endif
7136     sps->crop= get_bits1(&s->gb);
7137     if(sps->crop){
7138         sps->crop_left  = get_ue_golomb(&s->gb);
7139         sps->crop_right = get_ue_golomb(&s->gb);
7140         sps->crop_top   = get_ue_golomb(&s->gb);
7141         sps->crop_bottom= get_ue_golomb(&s->gb);
7142         if(sps->crop_left || sps->crop_top){
7143             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7144         }
7145         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7146             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7147         }
7148     }else{
7149         sps->crop_left  =
7150         sps->crop_right =
7151         sps->crop_top   =
7152         sps->crop_bottom= 0;
7153     }
7154
7155     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7156     if( sps->vui_parameters_present_flag )
7157         decode_vui_parameters(h, sps);
7158
7159     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7160         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7161                sps_id, sps->profile_idc, sps->level_idc,
7162                sps->poc_type,
7163                sps->ref_frame_count,
7164                sps->mb_width, sps->mb_height,
7165                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7166                sps->direct_8x8_inference_flag ? "8B8" : "",
7167                sps->crop_left, sps->crop_right,
7168                sps->crop_top, sps->crop_bottom,
7169                sps->vui_parameters_present_flag ? "VUI" : "",
7170                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7171                );
7172     }
7173     av_free(h->sps_buffers[sps_id]);
7174     h->sps_buffers[sps_id]= sps;
7175     return 0;
7176 fail:
7177     av_free(sps);
7178     return -1;
7179 }
7180
7181 static void
7182 build_qp_table(PPS *pps, int t, int index)
7183 {
7184     int i;
7185     for(i = 0; i < 52; i++)
7186         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7187 }
7188
7189 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7190     MpegEncContext * const s = &h->s;
7191     unsigned int pps_id= get_ue_golomb(&s->gb);
7192     PPS *pps;
7193
7194     if(pps_id >= MAX_PPS_COUNT) {
7195         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7196         return -1;
7197     }
7198
7199     pps= av_mallocz(sizeof(PPS));
7200     if(pps == NULL)
7201         return -1;
7202     pps->sps_id= get_ue_golomb(&s->gb);
7203     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7204         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7205         goto fail;
7206     }
7207
7208     pps->cabac= get_bits1(&s->gb);
7209     pps->pic_order_present= get_bits1(&s->gb);
7210     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7211     if(pps->slice_group_count > 1 ){
7212         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7213         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7214         switch(pps->mb_slice_group_map_type){
7215         case 0:
7216 #if 0
7217 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7218 |    run_length[ i ]                                |1  |ue(v)   |
7219 #endif
7220             break;
7221         case 2:
7222 #if 0
7223 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7224 |{                                                  |   |        |
7225 |    top_left_mb[ i ]                               |1  |ue(v)   |
7226 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7227 |   }                                               |   |        |
7228 #endif
7229             break;
7230         case 3:
7231         case 4:
7232         case 5:
7233 #if 0
7234 |   slice_group_change_direction_flag               |1  |u(1)    |
7235 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7236 #endif
7237             break;
7238         case 6:
7239 #if 0
7240 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7241 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7242 |)                                                  |   |        |
7243 |    slice_group_id[ i ]                            |1  |u(v)    |
7244 #endif
7245             break;
7246         }
7247     }
7248     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7249     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7250     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7251         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7252         goto fail;
7253     }
7254
7255     pps->weighted_pred= get_bits1(&s->gb);
7256     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7257     pps->init_qp= get_se_golomb(&s->gb) + 26;
7258     pps->init_qs= get_se_golomb(&s->gb) + 26;
7259     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7260     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7261     pps->constrained_intra_pred= get_bits1(&s->gb);
7262     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7263
7264     pps->transform_8x8_mode= 0;
7265     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7266     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7267     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7268
7269     if(get_bits_count(&s->gb) < bit_length){
7270         pps->transform_8x8_mode= get_bits1(&s->gb);
7271         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7272         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7273     } else {
7274         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7275     }
7276
7277     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7278     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7279     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7280         h->pps.chroma_qp_diff= 1;
7281
7282     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7283         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7284                pps_id, pps->sps_id,
7285                pps->cabac ? "CABAC" : "CAVLC",
7286                pps->slice_group_count,
7287                pps->ref_count[0], pps->ref_count[1],
7288                pps->weighted_pred ? "weighted" : "",
7289                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7290                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7291                pps->constrained_intra_pred ? "CONSTR" : "",
7292                pps->redundant_pic_cnt_present ? "REDU" : "",
7293                pps->transform_8x8_mode ? "8x8DCT" : ""
7294                );
7295     }
7296
7297     av_free(h->pps_buffers[pps_id]);
7298     h->pps_buffers[pps_id]= pps;
7299     return 0;
7300 fail:
7301     av_free(pps);
7302     return -1;
7303 }
7304
7305 /**
7306  * Call decode_slice() for each context.
7307  *
7308  * @param h h264 master context
7309  * @param context_count number of contexts to execute
7310  */
7311 static void execute_decode_slices(H264Context *h, int context_count){
7312     MpegEncContext * const s = &h->s;
7313     AVCodecContext * const avctx= s->avctx;
7314     H264Context *hx;
7315     int i;
7316
7317     if(context_count == 1) {
7318         decode_slice(avctx, &h);
7319     } else {
7320         for(i = 1; i < context_count; i++) {
7321             hx = h->thread_context[i];
7322             hx->s.error_recognition = avctx->error_recognition;
7323             hx->s.error_count = 0;
7324         }
7325
7326         avctx->execute(avctx, (void *)decode_slice,
7327                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7328
7329         /* pull back stuff from slices to master context */
7330         hx = h->thread_context[context_count - 1];
7331         s->mb_x = hx->s.mb_x;
7332         s->mb_y = hx->s.mb_y;
7333         s->dropable = hx->s.dropable;
7334         s->picture_structure = hx->s.picture_structure;
7335         for(i = 1; i < context_count; i++)
7336             h->s.error_count += h->thread_context[i]->s.error_count;
7337     }
7338 }
7339
7340
7341 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7342     MpegEncContext * const s = &h->s;
7343     AVCodecContext * const avctx= s->avctx;
7344     int buf_index=0;
7345     H264Context *hx; ///< thread context
7346     int context_count = 0;
7347
7348     h->max_contexts = avctx->thread_count;
7349 #if 0
7350     int i;
7351     for(i=0; i<50; i++){
7352         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7353     }
7354 #endif
7355     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7356         h->current_slice = 0;
7357         if (!s->first_field)
7358             s->current_picture_ptr= NULL;
7359     }
7360
7361     for(;;){
7362         int consumed;
7363         int dst_length;
7364         int bit_length;
7365         const uint8_t *ptr;
7366         int i, nalsize = 0;
7367         int err;
7368
7369         if(h->is_avc) {
7370             if(buf_index >= buf_size) break;
7371             nalsize = 0;
7372             for(i = 0; i < h->nal_length_size; i++)
7373                 nalsize = (nalsize << 8) | buf[buf_index++];
7374             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7375                 if(nalsize == 1){
7376                     buf_index++;
7377                     continue;
7378                 }else{
7379                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7380                     break;
7381                 }
7382             }
7383         } else {
7384             // start code prefix search
7385             for(; buf_index + 3 < buf_size; buf_index++){
7386                 // This should always succeed in the first iteration.
7387                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7388                     break;
7389             }
7390
7391             if(buf_index+3 >= buf_size) break;
7392
7393             buf_index+=3;
7394         }
7395
7396         hx = h->thread_context[context_count];
7397
7398         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7399         if (ptr==NULL || dst_length < 0){
7400             return -1;
7401         }
7402         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7403             dst_length--;
7404         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7405
7406         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7407             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7408         }
7409
7410         if (h->is_avc && (nalsize != consumed)){
7411             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7412             consumed= nalsize;
7413         }
7414
7415         buf_index += consumed;
7416
7417         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7418            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7419             continue;
7420
7421       again:
7422         err = 0;
7423         switch(hx->nal_unit_type){
7424         case NAL_IDR_SLICE:
7425             if (h->nal_unit_type != NAL_IDR_SLICE) {
7426                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7427                 return -1;
7428             }
7429             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7430         case NAL_SLICE:
7431             init_get_bits(&hx->s.gb, ptr, bit_length);
7432             hx->intra_gb_ptr=
7433             hx->inter_gb_ptr= &hx->s.gb;
7434             hx->s.data_partitioning = 0;
7435
7436             if((err = decode_slice_header(hx, h)))
7437                break;
7438
7439             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7440             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7441                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7442                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7443                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7444                && avctx->skip_frame < AVDISCARD_ALL)
7445                 context_count++;
7446             break;
7447         case NAL_DPA:
7448             init_get_bits(&hx->s.gb, ptr, bit_length);
7449             hx->intra_gb_ptr=
7450             hx->inter_gb_ptr= NULL;
7451             hx->s.data_partitioning = 1;
7452
7453             err = decode_slice_header(hx, h);
7454             break;
7455         case NAL_DPB:
7456             init_get_bits(&hx->intra_gb, ptr, bit_length);
7457             hx->intra_gb_ptr= &hx->intra_gb;
7458             break;
7459         case NAL_DPC:
7460             init_get_bits(&hx->inter_gb, ptr, bit_length);
7461             hx->inter_gb_ptr= &hx->inter_gb;
7462
7463             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7464                && s->context_initialized
7465                && s->hurry_up < 5
7466                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7467                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7468                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7469                && avctx->skip_frame < AVDISCARD_ALL)
7470                 context_count++;
7471             break;
7472         case NAL_SEI:
7473             init_get_bits(&s->gb, ptr, bit_length);
7474             decode_sei(h);
7475             break;
7476         case NAL_SPS:
7477             init_get_bits(&s->gb, ptr, bit_length);
7478             decode_seq_parameter_set(h);
7479
7480             if(s->flags& CODEC_FLAG_LOW_DELAY)
7481                 s->low_delay=1;
7482
7483             if(avctx->has_b_frames < 2)
7484                 avctx->has_b_frames= !s->low_delay;
7485             break;
7486         case NAL_PPS:
7487             init_get_bits(&s->gb, ptr, bit_length);
7488
7489             decode_picture_parameter_set(h, bit_length);
7490
7491             break;
7492         case NAL_AUD:
7493         case NAL_END_SEQUENCE:
7494         case NAL_END_STREAM:
7495         case NAL_FILLER_DATA:
7496         case NAL_SPS_EXT:
7497         case NAL_AUXILIARY_SLICE:
7498             break;
7499         default:
7500             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7501         }
7502
7503         if(context_count == h->max_contexts) {
7504             execute_decode_slices(h, context_count);
7505             context_count = 0;
7506         }
7507
7508         if (err < 0)
7509             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7510         else if(err == 1) {
7511             /* Slice could not be decoded in parallel mode, copy down
7512              * NAL unit stuff to context 0 and restart. Note that
7513              * rbsp_buffer is not transferred, but since we no longer
7514              * run in parallel mode this should not be an issue. */
7515             h->nal_unit_type = hx->nal_unit_type;
7516             h->nal_ref_idc   = hx->nal_ref_idc;
7517             hx = h;
7518             goto again;
7519         }
7520     }
7521     if(context_count)
7522         execute_decode_slices(h, context_count);
7523     return buf_index;
7524 }
7525
7526 /**
7527  * returns the number of bytes consumed for building the current frame
7528  */
7529 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7530         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7531         if(pos+10>buf_size) pos=buf_size; // oops ;)
7532
7533         return pos;
7534 }
7535
7536 static int decode_frame(AVCodecContext *avctx,
7537                              void *data, int *data_size,
7538                              const uint8_t *buf, int buf_size)
7539 {
7540     H264Context *h = avctx->priv_data;
7541     MpegEncContext *s = &h->s;
7542     AVFrame *pict = data;
7543     int buf_index;
7544
7545     s->flags= avctx->flags;
7546     s->flags2= avctx->flags2;
7547
7548    /* end of stream, output what is still in the buffers */
7549     if (buf_size == 0) {
7550         Picture *out;
7551         int i, out_idx;
7552
7553 //FIXME factorize this with the output code below
7554         out = h->delayed_pic[0];
7555         out_idx = 0;
7556         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7557             if(h->delayed_pic[i]->poc < out->poc){
7558                 out = h->delayed_pic[i];
7559                 out_idx = i;
7560             }
7561
7562         for(i=out_idx; h->delayed_pic[i]; i++)
7563             h->delayed_pic[i] = h->delayed_pic[i+1];
7564
7565         if(out){
7566             *data_size = sizeof(AVFrame);
7567             *pict= *(AVFrame*)out;
7568         }
7569
7570         return 0;
7571     }
7572
7573     if(h->is_avc && !h->got_avcC) {
7574         int i, cnt, nalsize;
7575         unsigned char *p = avctx->extradata;
7576         if(avctx->extradata_size < 7) {
7577             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7578             return -1;
7579         }
7580         if(*p != 1) {
7581             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7582             return -1;
7583         }
7584         /* sps and pps in the avcC always have length coded with 2 bytes,
7585            so put a fake nal_length_size = 2 while parsing them */
7586         h->nal_length_size = 2;
7587         // Decode sps from avcC
7588         cnt = *(p+5) & 0x1f; // Number of sps
7589         p += 6;
7590         for (i = 0; i < cnt; i++) {
7591             nalsize = AV_RB16(p) + 2;
7592             if(decode_nal_units(h, p, nalsize) < 0) {
7593                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7594                 return -1;
7595             }
7596             p += nalsize;
7597         }
7598         // Decode pps from avcC
7599         cnt = *(p++); // Number of pps
7600         for (i = 0; i < cnt; i++) {
7601             nalsize = AV_RB16(p) + 2;
7602             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7603                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7604                 return -1;
7605             }
7606             p += nalsize;
7607         }
7608         // Now store right nal length size, that will be use to parse all other nals
7609         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7610         // Do not reparse avcC
7611         h->got_avcC = 1;
7612     }
7613
7614     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7615         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7616             return -1;
7617         h->got_avcC = 1;
7618     }
7619
7620     buf_index=decode_nal_units(h, buf, buf_size);
7621     if(buf_index < 0)
7622         return -1;
7623
7624     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7625         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7626         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7627         return -1;
7628     }
7629
7630     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7631         Picture *out = s->current_picture_ptr;
7632         Picture *cur = s->current_picture_ptr;
7633         int i, pics, cross_idr, out_of_order, out_idx;
7634
7635         s->mb_y= 0;
7636
7637         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7638         s->current_picture_ptr->pict_type= s->pict_type;
7639
7640         if(!s->dropable) {
7641             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7642             h->prev_poc_msb= h->poc_msb;
7643             h->prev_poc_lsb= h->poc_lsb;
7644         }
7645         h->prev_frame_num_offset= h->frame_num_offset;
7646         h->prev_frame_num= h->frame_num;
7647
7648         /*
7649          * FIXME: Error handling code does not seem to support interlaced
7650          * when slices span multiple rows
7651          * The ff_er_add_slice calls don't work right for bottom
7652          * fields; they cause massive erroneous error concealing
7653          * Error marking covers both fields (top and bottom).
7654          * This causes a mismatched s->error_count
7655          * and a bad error table. Further, the error count goes to
7656          * INT_MAX when called for bottom field, because mb_y is
7657          * past end by one (callers fault) and resync_mb_y != 0
7658          * causes problems for the first MB line, too.
7659          */
7660         if (!FIELD_PICTURE)
7661             ff_er_frame_end(s);
7662
7663         MPV_frame_end(s);
7664
7665         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7666             /* Wait for second field. */
7667             *data_size = 0;
7668
7669         } else {
7670             cur->repeat_pict = 0;
7671
7672             /* Signal interlacing information externally. */
7673             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7674             if(h->sps.pic_struct_present_flag){
7675                 switch (h->sei_pic_struct)
7676                 {
7677                 case SEI_PIC_STRUCT_FRAME:
7678                     cur->interlaced_frame = 0;
7679                     break;
7680                 case SEI_PIC_STRUCT_TOP_FIELD:
7681                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7682                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7683                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7684                     cur->interlaced_frame = 1;
7685                     break;
7686                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7687                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7688                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7689                     // From these hints, let the applications decide if they apply deinterlacing.
7690                     cur->repeat_pict = 1;
7691                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7692                     break;
7693                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7694                     // Force progressive here, as doubling interlaced frame is a bad idea.
7695                     cur->interlaced_frame = 0;
7696                     cur->repeat_pict = 2;
7697                     break;
7698                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7699                     cur->interlaced_frame = 0;
7700                     cur->repeat_pict = 4;
7701                     break;
7702                 }
7703             }else{
7704                 /* Derive interlacing flag from used decoding process. */
7705                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7706             }
7707
7708             if (cur->field_poc[0] != cur->field_poc[1]){
7709                 /* Derive top_field_first from field pocs. */
7710                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7711             }else{
7712                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7713                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7714                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7715                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7716                         cur->top_field_first = 1;
7717                     else
7718                         cur->top_field_first = 0;
7719                 }else{
7720                     /* Most likely progressive */
7721                     cur->top_field_first = 0;
7722                 }
7723             }
7724
7725         //FIXME do something with unavailable reference frames
7726
7727             /* Sort B-frames into display order */
7728
7729             if(h->sps.bitstream_restriction_flag
7730                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7731                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7732                 s->low_delay = 0;
7733             }
7734
7735             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7736                && !h->sps.bitstream_restriction_flag){
7737                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7738                 s->low_delay= 0;
7739             }
7740
7741             pics = 0;
7742             while(h->delayed_pic[pics]) pics++;
7743
7744             assert(pics <= MAX_DELAYED_PIC_COUNT);
7745
7746             h->delayed_pic[pics++] = cur;
7747             if(cur->reference == 0)
7748                 cur->reference = DELAYED_PIC_REF;
7749
7750             out = h->delayed_pic[0];
7751             out_idx = 0;
7752             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7753                 if(h->delayed_pic[i]->poc < out->poc){
7754                     out = h->delayed_pic[i];
7755                     out_idx = i;
7756                 }
7757             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7758
7759             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7760
7761             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7762                 { }
7763             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7764                || (s->low_delay &&
7765                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7766                  || cur->pict_type == FF_B_TYPE)))
7767             {
7768                 s->low_delay = 0;
7769                 s->avctx->has_b_frames++;
7770             }
7771
7772             if(out_of_order || pics > s->avctx->has_b_frames){
7773                 out->reference &= ~DELAYED_PIC_REF;
7774                 for(i=out_idx; h->delayed_pic[i]; i++)
7775                     h->delayed_pic[i] = h->delayed_pic[i+1];
7776             }
7777             if(!out_of_order && pics > s->avctx->has_b_frames){
7778                 *data_size = sizeof(AVFrame);
7779
7780                 h->outputed_poc = out->poc;
7781                 *pict= *(AVFrame*)out;
7782             }else{
7783                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7784             }
7785         }
7786     }
7787
7788     assert(pict->data[0] || !*data_size);
7789     ff_print_debug_info(s, pict);
7790 //printf("out %d\n", (int)pict->data[0]);
7791 #if 0 //?
7792
7793     /* Return the Picture timestamp as the frame number */
7794     /* we subtract 1 because it is added on utils.c     */
7795     avctx->frame_number = s->picture_number - 1;
7796 #endif
7797     return get_consumed_bytes(s, buf_index, buf_size);
7798 }
#if 0
/**
 * Fill h->mb_avail[] for the current macroblock (this code is compiled out).
 *
 * Entries 0..2 describe the top-left, top and top-right neighbours and entry
 * 3 the left neighbour; a neighbour counts as available when it lies inside
 * the picture and its slice_table entry equals h->slice_num.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int cur_xy= s->mb_x + s->mb_y*s->mb_stride;

    if(!s->mb_y){
        /* first macroblock row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        const int above_xy= cur_xy - s->mb_stride;

        h->mb_avail[0]= s->mb_x                 && h->slice_table[above_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[above_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[above_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[cur_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7818
7819 #ifdef TEST
7820 #undef printf
7821 #undef random
7822 #define COUNT 8000
7823 #define SIZE (COUNT*40)
     /**
      * Self-test entry point, built only when TEST is defined.
      * Writes COUNT unsigned and then COUNT signed Exp-Golomb codes into a
      * buffer, reads them back, and prints a "mismatch!" line for every value
      * that does not round-trip. Each write/read call is wrapped in
      * START_TIMER / STOP_TIMER.
      * The 4x4 (I)DCT, quantizer and NAL layer tests further down are disabled
      * by a preprocessor conditional.
      * @return 0 (the only return statement that is compiled in)
      */
7824 int main(void){
7825     int i;
7826     uint8_t temp[SIZE]; /* scratch bitstream shared by the write and read passes */
7827     PutBitContext pb;
7828     GetBitContext gb;
7829 //    int int_temp[10000];
7830     DSPContext dsp;
7831     AVCodecContext avctx; /* NOTE(review): handed to dsputil_init() without being initialized - confirm that is safe */
7832
7833     dsputil_init(&dsp, &avctx);
7834
          /* pass 1: unsigned codes, values 0 .. COUNT-1 */
7835     init_put_bits(&pb, temp, SIZE);
7836     printf("testing unsigned exp golomb\n");
7837     for(i=0; i<COUNT; i++){
7838         START_TIMER
7839         set_ue_golomb(&pb, i);
7840         STOP_TIMER("set_ue_golomb");
7841     }
7842     flush_put_bits(&pb);
7843
7844     init_get_bits(&gb, temp, 8*SIZE); /* 8*SIZE: presumably the buffer size expressed in bits */
7845     for(i=0; i<COUNT; i++){
7846         int j, s;
7847
7848         s= show_bits(&gb, 24); /* captured only for the mismatch message below */
7849
7850         START_TIMER
7851         j= get_ue_golomb(&gb);
7852         if(j != i){
7853             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7854 //            return -1;
7855         }
7856         STOP_TIMER("get_ue_golomb");
7857     }
7858
7859
          /* pass 2: signed codes, values -COUNT/2 .. COUNT/2-1 */
7860     init_put_bits(&pb, temp, SIZE);
7861     printf("testing signed exp golomb\n");
7862     for(i=0; i<COUNT; i++){
7863         START_TIMER
7864         set_se_golomb(&pb, i - COUNT/2);
7865         STOP_TIMER("set_se_golomb");
7866     }
7867     flush_put_bits(&pb);
7868
7869     init_get_bits(&gb, temp, 8*SIZE);
7870     for(i=0; i<COUNT; i++){
7871         int j, s;
7872
7873         s= show_bits(&gb, 24);
7874
7875         START_TIMER
7876         j= get_se_golomb(&gb);
7877         if(j != i - COUNT/2){
                  /* report the value that was actually encoded (i - COUNT/2), not the loop index */
7878             printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
7879 //            return -1;
7880         }
7881         STOP_TIMER("get_se_golomb");
7882     }
7883
7884 #if 0 /* disabled tests; they use s, qp, src1_block and src2_block, none of which is declared in main() */
7885     printf("testing 4x4 (I)DCT\n");
7886
7887     DCTELEM block[16];
7888     uint8_t src[16], ref[16];
7889     uint64_t error= 0, max_error=0;
7890
7891     for(i=0; i<COUNT; i++){
7892         int j;
7893 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
7894         for(j=0; j<16; j++){
7895             ref[j]= random()%255;
7896             src[j]= random()%255;
7897         }
7898
7899         h264_diff_dct_c(block, src, ref, 4);
7900
7901         //normalize
7902         for(j=0; j<16; j++){
7903 //            printf("%d ", block[j]);
7904             block[j]= block[j]*4;
7905             if(j&1) block[j]= (block[j]*4 + 2)/5;
7906             if(j&4) block[j]= (block[j]*4 + 2)/5;
7907         }
7908 //        printf("\n");
7909
7910         s->dsp.h264_idct_add(ref, block, 4);
7911 /*        for(j=0; j<16; j++){
7912             printf("%d ", ref[j]);
7913         }
7914         printf("\n");*/
7915
7916         for(j=0; j<16; j++){
7917             int diff= FFABS(src[j] - ref[j]);
7918
7919             error+= diff*diff;
7920             max_error= FFMAX(max_error, diff);
7921         }
7922     }
7923     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7924     printf("testing quantizer\n");
7925     for(qp=0; qp<52; qp++){
7926         for(i=0; i<16; i++)
7927             src1_block[i]= src2_block[i]= random()%255;
7928
7929     }
7930     printf("Testing NAL layer\n");
7931
7932     uint8_t bitstream[COUNT];
7933     uint8_t nal[COUNT*2];
7934     H264Context h;
7935     memset(&h, 0, sizeof(H264Context));
7936
7937     for(i=0; i<COUNT; i++){
7938         int zeros= i;
7939         int nal_length;
7940         int consumed;
7941         int out_length;
7942         uint8_t *out;
7943         int j;
7944
              /* start from a payload that contains no zero bytes ... */
7945         for(j=0; j<COUNT; j++){
7946             bitstream[j]= (random() % 255) + 1;
7947         }
7948
              /* ... then zero exactly "zeros" distinct positions, probing forward past bytes already zero */
7949         for(j=0; j<zeros; j++){
7950             int pos= random() % COUNT;
7951             while(bitstream[pos] == 0){
7952                 pos++;
7953                 pos %= COUNT;
7954             }
7955             bitstream[pos]=0;
7956         }
7957
7958         START_TIMER
7959
7960         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
7961         if(nal_length<0){
7962             printf("encoding failed\n");
7963             return -1;
7964         }
7965
7966         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
7967
7968         STOP_TIMER("NAL")
7969
7970         if(out_length != COUNT){
7971             printf("incorrect length %d %d\n", out_length, COUNT);
7972             return -1;
7973         }
7974
7975         if(consumed != nal_length){
7976             printf("incorrect consumed length %d %d\n", nal_length, consumed);
7977             return -1;
7978         }
7979
7980         if(memcmp(bitstream, out, COUNT)){
7981             printf("mismatch\n");
7982             return -1;
7983         }
7984     }
7985 #endif
7986
          /* only the banner is printed; no RBSP checks follow it */
7987     printf("Testing RBSP\n");
7988
7989
7990     return 0;
7991 }
7992 #endif /* TEST */
7993
7994
     /**
      * Tears down the H.264 decoder state stored in avctx->priv_data; this
      * function is referenced from the h264_decoder initializer.
      *
      * Releases both h->rbsp_buffer entries, calls free_tables(), releases every
      * entry of h->sps_buffers and h->pps_buffers, and finally calls
      * MPV_common_end() on the embedded MpegEncContext.
      *
      * @param avctx codec context whose priv_data is the H264Context to clean up
      * @return 0 in all cases
      */
7995 static av_cold int decode_end(AVCodecContext *avctx)
7996 {
7997     H264Context *h = avctx->priv_data;
7998     MpegEncContext *s = &h->s; /* the MpegEncContext is a member of H264Context, not a separate object */
7999     int i;
8000
8001     av_freep(&h->rbsp_buffer[0]);
8002     av_freep(&h->rbsp_buffer[1]);
8003     free_tables(h); //FIXME cleanup init stuff perhaps
8004
          /* h->sps_buffers + i is the address of slot i, i.e. &h->sps_buffers[i] */
8005     for(i = 0; i < MAX_SPS_COUNT; i++)
8006         av_freep(h->sps_buffers + i);
8007
          /* same per-slot release for the PPS table */
8008     for(i = 0; i < MAX_PPS_COUNT; i++)
8009         av_freep(h->pps_buffers + i);
8010
8011     MPV_common_end(s);
8012
8013 //    memset(h, 0, sizeof(H264Context));
8014
8015     return 0;
8016 }
8017
8018
     /**
      * Codec descriptor for the H.264 decoder, identified by the string "h264".
      * It references decode_init, decode_end, decode_frame and flush_dpb.
      * The leading members are initialized positionally; only flush and
      * long_name use designated initializers.
      */
8019 AVCodec h264_decoder = {
8020     "h264",
8021     CODEC_TYPE_VIDEO,
8022     CODEC_ID_H264,
8023     sizeof(H264Context), /* decode_end() treats avctx->priv_data as an H264Context */
8024     decode_init,
8025     NULL,                /* no function supplied for this positional slot */
8026     decode_end,
8027     decode_frame,
8028     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY, /* the DRAW_HORIZ_BAND flag is commented out */
8029     .flush= flush_dpb,
8030     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8031 };
8032
8033 #include "svq3.c"