libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38 #ifdef ARCH_X86
  39 #include "i386/h264_i386.h"
  40 #endif
  41
  42 //#undef NDEBUG
  43 #include <assert.h>
  44
  45 /**
  46  * Value of Picture.reference when Picture is not a reference picture, but
  47  * is held for delayed output.
  48  */
  49 #define DELAYED_PIC_REF 4
  50
     /*
      * VLC objects, each followed by a statically sized VLC_TYPE array and a
      * constant holding that array's entry count. The arrays are presumably the
      * backing storage handed to the VLC initialisation code - TODO confirm
      * against the init function.
      * coeff_token_vlc_tables is one flat array whose length is the sum of the
      * four per-table sizes listed in coeff_token_vlc_tables_size.
      */
  51 static VLC coeff_token_vlc[4];
  52 static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
  53 static const int coeff_token_vlc_tables_size[4]={520,332,280,256};
  54
  55 static VLC chroma_dc_coeff_token_vlc;
  56 static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
  57 static const int chroma_dc_coeff_token_vlc_table_size = 256;
  58
  59 static VLC total_zeros_vlc[15];
  60 static VLC_TYPE total_zeros_vlc_tables[15][512][2];
  61 static const int total_zeros_vlc_tables_size = 512;
  62
  63 static VLC chroma_dc_total_zeros_vlc[3];
  64 static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
  65 static const int chroma_dc_total_zeros_vlc_tables_size = 8;
  66
  67 static VLC run_vlc[6];
  68 static VLC_TYPE run_vlc_tables[6][8][2];
  69 static const int run_vlc_tables_size = 8;
  70
  71 static VLC run7_vlc;
  72 static VLC_TYPE run7_vlc_table[96][2];
  73 static const int run7_vlc_table_size = 96;
  74
     /* Forward declarations of file-local functions that are referenced before
      * they are defined. */
  75 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  76 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  77 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  78 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  79 static Picture * remove_long(H264Context *h, int i, int ref_mask);
  80
  81 static av_always_inline uint32_t pack16to32(int a, int b){
  82 #ifdef WORDS_BIGENDIAN
  83    return (b&0xFFFF) + (a<<16);
  84 #else
  85    return (a&0xFFFF) + (b<<16);
  86 #endif
  87 }
  88
  /* Lookup table: rem6[q] == q % 6 for q = 0..51. */
  static const uint8_t rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  92
  /* Lookup table: div6[q] == q / 6 for q = 0..51. */
  static const uint8_t div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  96
  /*
   * Index tables fill_caches() uses to address entries of the left
   * neighbour(s). fill_caches() picks the row as follows:
   *   row 0: default (left pair and current MB have the same frame/field type,
   *          or no MBAFF),
   *   row 1: current MB is a frame MB, left pair differs, bottom MB of the pair,
   *   row 2: current MB is a frame MB, left pair differs, top MB of the pair,
   *   row 3: current MB is a field MB, left pair differs.
   * Columns 0..3 index the left MB's intra4x4_pred_mode, non_zero_count and
   * motion data rows; columns 4..7 are used only as non_zero_count indices.
   */
  static const int left_block_options[4][8]={
      { 0,  1,  2,  3,     7, 10,  8, 11 },
      { 2,  2,  3,  3,     8, 11,  8, 11 },
      { 0,  0,  1,  1,     7, 10,  7, 10 },
      { 0,  2,  0,  2,     7, 10,  7, 10 }
  };
 103
 /**
  * Loads information about the neighbours of the current macroblock
  * (h->mb_xy) into the per-macroblock caches of the H264Context.
  *
  * The function resolves the addresses of the top-left, top, top-right and
  * left neighbours (adjusting them for FRAME_MBAFF macroblock pairs), stores
  * the top/left addresses in h->top_mb_xy and h->left_mb_xy[], and then fills:
  *  - the *_samples_available masks and, for intra4x4 macroblocks,
  *    intra4x4_pred_mode_cache (intra macroblocks, only when !for_deblock),
  *  - the neighbour entries of non_zero_count_cache,
  *  - top_cbp / left_cbp (only when h->pps.cabac is set),
  *  - the neighbour entries of mv_cache / ref_cache and, when !for_deblock,
  *    also mvd_cache / direct_cache for CABAC (inter and direct macroblocks),
  *  - h->neighbor_transform_size.
  *
  * @param h           decoder context; caches and neighbour addresses are
  *                    written into it
  * @param mb_type     type flags of the current macroblock
  * @param for_deblock non-zero selects the reduced variant (presumably used by
  *                    the loop filter): neighbours count as available whenever
  *                    their slice_table entry is below 0xFFFF instead of having
  *                    to match h->slice_num, the diagonal neighbours are
  *                    ignored, and the function returns right away - before
  *                    anything is stored - when !FRAME_MBAFF and either
  *                    h->slice_num == 1 or the top neighbour is in the same
  *                    slice as the current macroblock
  */
 104 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 105     MpegEncContext * const s = &h->s;
 106     const int mb_xy= h->mb_xy;
 107     int topleft_xy, top_xy, topright_xy, left_xy[2];
 108     int topleft_type, top_type, topright_type, left_type[2];
 109     const int * left_block;
 110     int topleft_partition= -1;
 111     int i;
 112
 113     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
 114
 115     //FIXME deblocking could skip the intra and nnz parts.
 116     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
 117         return;
 118
 119     /* Wow, what a mess, why didn't they simplify the interlacing & intra
 120      * stuff, I can't imagine that these complex rules are worth it. */
 121
         // Default neighbour addresses; the FRAME_MBAFF block below adjusts them.
 122     topleft_xy = top_xy - 1;
 123     topright_xy= top_xy + 1;
 124     left_xy[1] = left_xy[0] = mb_xy-1;
 125     left_block = left_block_options[0];
 126     if(FRAME_MBAFF){
 127         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 128         const int top_pair_xy      = pair_xy     - s->mb_stride;
 129         const int topleft_pair_xy  = top_pair_xy - 1;
 130         const int topright_pair_xy = top_pair_xy + 1;
 131         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 132         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 133         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 134         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 135         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 136         const int bottom = (s->mb_y & 1);
 137         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 138         if (bottom
 139                 ? !curr_mb_frame_flag // bottom macroblock
 140                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 141                 ) {
 142             top_xy -= s->mb_stride;
 143         }
 144         if (bottom
 145                 ? !curr_mb_frame_flag // bottom macroblock
 146                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 147                 ) {
 148             topleft_xy -= s->mb_stride;
 149         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 150             topleft_xy += s->mb_stride;
 151             // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
 152             topleft_partition = 0;
 153         }
 154         if (bottom
 155                 ? !curr_mb_frame_flag // bottom macroblock
 156                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 157                 ) {
 158             topright_xy -= s->mb_stride;
 159         }
             // Left pair has a different frame/field type than the current MB:
             // address the pair's top MB and pick the matching index table.
 160         if (left_mb_frame_flag != curr_mb_frame_flag) {
 161             left_xy[1] = left_xy[0] = pair_xy - 1;
 162             if (curr_mb_frame_flag) {
 163                 if (bottom) {
 164                     left_block = left_block_options[1];
 165                 } else {
 166                     left_block= left_block_options[2];
 167                 }
 168             } else {
 169                 left_xy[1] += s->mb_stride;
 170                 left_block = left_block_options[3];
 171             }
 172         }
 173     }
 174
         // Publish the resolved top/left neighbour addresses in the context.
 175     h->top_mb_xy = top_xy;
 176     h->left_mb_xy[0] = left_xy[0];
 177     h->left_mb_xy[1] = left_xy[1];
 178     if(for_deblock){
             // Neighbour types are taken whenever the slice_table entry is below
             // 0xFFFF (not only for the current slice); diagonals are not used.
 179         topleft_type = 0;
 180         topright_type = 0;
 181         top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
 182         left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
 183         left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;
 184
 185         if(MB_MBAFF && !IS_INTRA(mb_type)){
 186             int list;
 187             for(list=0; list<h->list_count; list++){
 188                 //These values were changed for ease of performing MC, we need to change them back
 189                 //FIXME maybe we can make MC and loop filter use the same values or prevent
 190                 //the MC code from changing ref_cache and rather use a temporary array.
 191                 if(USES_LIST(mb_type,list)){
 192                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 193                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 194                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 195                     ref += h->b8_stride;
 196                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 197                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
 198                 }
 199             }
 200         }
 201     }else{
             // A neighbour is only usable if it belongs to the current slice;
             // otherwise its type is treated as 0 (unavailable).
 202         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 203         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 204         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 205         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 206         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 207
 208     if(IS_INTRA(mb_type)){
             // With constrained_intra_pred only intra neighbours count as
             // available for prediction; otherwise any present neighbour does.
 209         int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
 210         h->topleft_samples_available=
 211         h->top_samples_available=
 212         h->left_samples_available= 0xFFFF;
 213         h->topright_samples_available= 0xEEEA;
 214
 215         if(!(top_type & type_mask)){
 216             h->topleft_samples_available= 0xB3FF;
 217             h->top_samples_available= 0x33FF;
 218             h->topright_samples_available= 0x26EA;
 219         }
 220         if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
 221             if(IS_INTERLACED(mb_type)){
 222                 if(!(left_type[0] & type_mask)){
 223                     h->topleft_samples_available&= 0xDFFF;
 224                     h->left_samples_available&= 0x5FFF;
 225                 }
 226                 if(!(left_type[1] & type_mask)){
 227                     h->topleft_samples_available&= 0xFF5F;
 228                     h->left_samples_available&= 0xFF5F;
 229                 }
 230             }else{
 231                 int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
 232                                 ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
 233                 assert(left_xy[0] == left_xy[1]);
 234                 if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
 235                     h->topleft_samples_available&= 0xDF5F;
 236                     h->left_samples_available&= 0x5F5F;
 237                 }
 238             }
 239         }else{
 240             if(!(left_type[0] & type_mask)){
 241                 h->topleft_samples_available&= 0xDF5F;
 242                 h->left_samples_available&= 0x5F5F;
 243             }
 244         }
 245
 246         if(!(topleft_type & type_mask))
 247             h->topleft_samples_available&= 0x7FFF;
 248
 249         if(!(topright_type & type_mask))
 250             h->topright_samples_available&= 0xFBFF;
 251
 252         if(IS_INTRA4x4(mb_type)){
                 // Neighbour prediction modes: copied from intra4x4 neighbours,
                 // otherwise -1 if the neighbour is unusable or 2 if it is usable
                 // but not intra4x4.
 253             if(IS_INTRA4x4(top_type)){
 254                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 255                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 256                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 257                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 258             }else{
 259                 int pred;
 260                 if(!(top_type & type_mask))
 261                     pred= -1;
 262                 else{
 263                     pred= 2;
 264                 }
 265                 h->intra4x4_pred_mode_cache[4+8*0]=
 266                 h->intra4x4_pred_mode_cache[5+8*0]=
 267                 h->intra4x4_pred_mode_cache[6+8*0]=
 268                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 269             }
 270             for(i=0; i<2; i++){
 271                 if(IS_INTRA4x4(left_type[i])){
 272                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 273                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 274                 }else{
 275                     int pred;
 276                     if(!(left_type[i] & type_mask))
 277                         pred= -1;
 278                     else{
 279                         pred= 2;
 280                     }
 281                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 282                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 283                 }
 284             }
 285         }
 286     }
 287     }
 288
 289
 290 /*
 291 0 . T T. T T T T
 292 1 L . .L . . . .
 293 2 L . .L . . . .
 294 3 . T TL . . . .
 295 4 L . .L . . . .
 296 5 L . .. . . . .
 297 */
 298 //FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
         // Non-zero coefficient counts of the top neighbour; when it is missing
         // the entries are set to 0 (CABAC and non-intra MB) or 64.
 299     if(top_type){
 300         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 301         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 302         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 303         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 304
 305         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 306         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 307
 308         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 309         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 310
 311     }else{
 312         h->non_zero_count_cache[4+8*0]=
 313         h->non_zero_count_cache[5+8*0]=
 314         h->non_zero_count_cache[6+8*0]=
 315         h->non_zero_count_cache[7+8*0]=
 316
 317         h->non_zero_count_cache[1+8*0]=
 318         h->non_zero_count_cache[2+8*0]=
 319
 320         h->non_zero_count_cache[1+8*3]=
 321         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 322
 323     }
 324
         // Same for the two left neighbour halves, addressed through left_block.
 325     for (i=0; i<2; i++) {
 326         if(left_type[i]){
 327             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 328             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 329             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 330             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 331         }else{
 332             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 333             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 334             h->non_zero_count_cache[0+8*1 +   8*i]=
 335             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 336         }
 337     }
 338
 339     if( h->pps.cabac ) {
 340         // top_cbp
 341         if(top_type) {
 342             h->top_cbp = h->cbp_table[top_xy];
 343         } else if(IS_INTRA(mb_type)) {
 344             h->top_cbp = 0x1C0;
 345         } else {
 346             h->top_cbp = 0;
 347         }
 348         // left_cbp
 349         if (left_type[0]) {
 350             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 351         } else if(IS_INTRA(mb_type)) {
 352             h->left_cbp = 0x1C0;
 353         } else {
 354             h->left_cbp = 0;
 355         }
 356         if (left_type[0]) {
 357             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 358         }
 359         if (left_type[1]) {
 360             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 361         }
 362     }
 363
 364 #if 1
         // Motion vectors and reference indices of the neighbours, per list.
 365     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 366         int list;
 367         for(list=0; list<h->list_count; list++){
 368             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 369                 /*if(!h->mv_cache_clean[list]){
 370                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 371                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 372                     h->mv_cache_clean[list]= 1;
 373                 }*/
 374                 continue;
 375             }
 376             h->mv_cache_clean[list]= 0;
 377
 378             if(USES_LIST(top_type, list)){
 379                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 380                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 381                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 382                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 383                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 384                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 385                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 386                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 387                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 388                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 389             }else{
 390                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 391                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 392                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 393                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 394                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 395             }
 396
 397             for(i=0; i<2; i++){
 398                 int cache_idx = scan8[0] - 1 + i*2*8;
 399                 if(USES_LIST(left_type[i], list)){
 400                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 401                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 402                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 403                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 404                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 405                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 406                 }else{
 407                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 408                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 409                     h->ref_cache[list][cache_idx  ]=
 410                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 411                 }
 412             }
 413
                 // Only the top and left neighbours are loaded for deblocking and
                 // for temporal direct MBs outside MBAFF; skip the rest.
 414             if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
 415                 continue;
 416
 417             if(USES_LIST(topleft_type, list)){
 418                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 419                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 420                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 421                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 422             }else{
 423                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 424                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 425             }
 426
 427             if(USES_LIST(topright_type, list)){
 428                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 429                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 430                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 431                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 432             }else{
 433                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 434                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 435             }
 436
 437             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 438                 continue;
 439
 440             h->ref_cache[list][scan8[5 ]+1] =
 441             h->ref_cache[list][scan8[7 ]+1] =
 442             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 443             h->ref_cache[list][scan8[4 ]] =
 444             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 445             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 446             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 447             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 448             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 449             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 450
 451             if( h->pps.cabac ) {
 452                 /* XXX beurk, Load mvd */
 453                 if(USES_LIST(top_type, list)){
 454                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 455                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 456                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 457                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 458                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 459                 }else{
 460                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 461                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 462                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 463                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 464                 }
 465                 if(USES_LIST(left_type[0], list)){
 466                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 467                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 468                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 469                 }else{
 470                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 471                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 472                 }
 473                 if(USES_LIST(left_type[1], list)){
 474                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 475                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 476                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 477                 }else{
 478                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 479                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 480                 }
 481                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 482                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 483                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 484                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 485                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 486
 487                 if(h->slice_type_nos == FF_B_TYPE){
 488                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 489
 490                     if(IS_DIRECT(top_type)){
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 492                     }else if(IS_8X8(top_type)){
 493                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 494                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 495                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 496                     }else{
 497                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 498                     }
 499
 500                     if(IS_DIRECT(left_type[0]))
 501                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 502                     else if(IS_8X8(left_type[0]))
 503                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 504                     else
 505                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 506
 507                     if(IS_DIRECT(left_type[1]))
 508                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 509                     else if(IS_8X8(left_type[1]))
 510                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 511                     else
 512                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 513                 }
 514             }
 515
                 // MBAFF: rescale the cached neighbour refs / vertical mv components
                 // when a neighbour's frame/field type differs from the current MB.
                 // MAP_MVS applies MAP_F2F (defined per branch below) to every
                 // neighbour cache position.
 516             if(FRAME_MBAFF){
 517 #define MAP_MVS\
 518                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 519                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 520                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 521                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 522                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 523                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 524                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 525                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 526                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 527                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 528                 if(MB_FIELD){
 529 #define MAP_F2F(idx, mb_type)\
 530                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 531                         h->ref_cache[list][idx] <<= 1;\
 532                         h->mv_cache[list][idx][1] /= 2;\
 533                         h->mvd_cache[list][idx][1] /= 2;\
 534                     }
 535                     MAP_MVS
 536 #undef MAP_F2F
 537                 }else{
 538 #define MAP_F2F(idx, mb_type)\
 539                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 540                         h->ref_cache[list][idx] >>= 1;\
 541                         h->mv_cache[list][idx][1] <<= 1;\
 542                         h->mvd_cache[list][idx][1] <<= 1;\
 543                     }
 544                     MAP_MVS
 545 #undef MAP_F2F
 546                 }
 547             }
 548         }
 549     }
 550 #endif
 551
         // Number (0..2) of top / first-left neighbours flagged as 8x8 transform.
 552     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 553 }
 554
 555 static inline void write_back_intra_pred_mode(H264Context *h){
 556     const int mb_xy= h->mb_xy;
 557
 558     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 559     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 560     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 561     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 562     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 563     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 564     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 565 }
 566
 567 /**
 568  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 569  */
 570 static inline int check_intra4x4_pred_mode(H264Context *h){
 571     MpegEncContext * const s = &h->s;
 572     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 573     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 574     int i;
 575
 576     if(!(h->top_samples_available&0x8000)){
 577         for(i=0; i<4; i++){
 578             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 579             if(status<0){
 580                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 581                 return -1;
 582             } else if(status){
 583                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 584             }
 585         }
 586     }
 587
 588     if((h->left_samples_available&0x8888)!=0x8888){
 589         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 590         for(i=0; i<4; i++){
 591             if(!(h->left_samples_available&mask[i])){
 592             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 593             if(status<0){
 594                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 595                 return -1;
 596             } else if(status){
 597                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 598             }
 599             }
 600         }
 601     }
 602
 603     return 0;
 604 } //FIXME cleanup like next
 605
 606 /**
 607  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 608  */
 609 static inline int check_intra_pred_mode(H264Context *h, int mode){
 610     MpegEncContext * const s = &h->s;
 611     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 612     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 613
 614     if(mode > 6U) {
 615         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 616         return -1;
 617     }
 618
 619     if(!(h->top_samples_available&0x8000)){
 620         mode= top[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     if((h->left_samples_available&0x8080) != 0x8080){
 628         mode= left[ mode ];
 629         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 630             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 631         }
 632         if(mode<0){
 633             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 634             return -1;
 635         }
 636     }
 637
 638     return mode;
 639 }
 640
 641 /**
 642  * gets the predicted intra4x4 prediction mode.
 643  */
 644 static inline int pred_intra_mode(H264Context *h, int n){
 645     const int index8= scan8[n];
 646     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 647     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 648     const int min= FFMIN(left, top);
 649
 650     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 651
 652     if(min<0) return DC_PRED;
 653     else      return min;
 654 }
 655
 656 static inline void write_back_non_zero_count(H264Context *h){
 657     const int mb_xy= h->mb_xy;
 658
 659     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 660     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 661     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 662     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 663     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 664     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 665     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 666
 667     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 668     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 669     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 670
 671     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 672     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 673     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 674 }
 675
 676 /**
 677  * gets the predicted number of non-zero coefficients.
 678  * @param n block index
 679  */
 680 static inline int pred_non_zero_count(H264Context *h, int n){
 681     const int index8= scan8[n];
 682     const int left= h->non_zero_count_cache[index8 - 1];
 683     const int top = h->non_zero_count_cache[index8 - 8];
 684     int i= left + top;
 685
 686     if(i<64) i= (i+1)>>1;
 687
 688     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 689
 690     return i&31;
 691 }
 692
 693 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 694     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 695     MpegEncContext *s = &h->s;
 696
 697     /* there is no consistent mapping of mvs to neighboring locations that will
 698      * make mbaff happy, so we can't move all this logic to fill_caches */
 699     if(FRAME_MBAFF){
 700         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 701         const int16_t *mv;
 702         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 703         *C = h->mv_cache[list][scan8[0]-2];
 704
 705         if(!MB_FIELD
 706            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 707             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 708             if(IS_INTERLACED(mb_types[topright_xy])){
 709 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 710                 const int x4 = X4, y4 = Y4;\
 711                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 712                 if(!USES_LIST(mb_type,list))\
 713                     return LIST_NOT_USED;\
 714                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 715                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 716                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 717                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 718
 719                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 720             }
 721         }
 722         if(topright_ref == PART_NOT_AVAILABLE
 723            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 724            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 725             if(!MB_FIELD
 726                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 727                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 728             }
 729             if(MB_FIELD
 730                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 731                && i >= scan8[0]+8){
 732                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 733                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 734             }
 735         }
 736 #undef SET_DIAG_MV
 737     }
 738
 739     if(topright_ref != PART_NOT_AVAILABLE){
 740         *C= h->mv_cache[list][ i - 8 + part_width ];
 741         return topright_ref;
 742     }else{
 743         tprintf(s->avctx, "topright MV not available\n");
 744
 745         *C= h->mv_cache[list][ i - 8 - 1 ];
 746         return h->ref_cache[list][ i - 8 - 1 ];
 747     }
 748 }
 749
 750 /**
 751  * gets the predicted MV.
 752  * @param n the block index
 753  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 754  * @param mx the x component of the predicted motion vector
 755  * @param my the y component of the predicted motion vector
 756  */
 757 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 758     const int index8= scan8[n];
 759     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 760     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 761     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 762     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 763     const int16_t * C;
 764     int diagonal_ref, match_count;
 765
 766     assert(part_width==1 || part_width==2 || part_width==4);
 767
 768 /* mv_cache
 769   B . . A T T T T
 770   U . . L . . , .
 771   U . . L . . . .
 772   U . . L . . , .
 773   . . . L . . . .
 774 */
 775
 776     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 777     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 778     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 779     if(match_count > 1){ //most common
 780         *mx= mid_pred(A[0], B[0], C[0]);
 781         *my= mid_pred(A[1], B[1], C[1]);
 782     }else if(match_count==1){
 783         if(left_ref==ref){
 784             *mx= A[0];
 785             *my= A[1];
 786         }else if(top_ref==ref){
 787             *mx= B[0];
 788             *my= B[1];
 789         }else{
 790             *mx= C[0];
 791             *my= C[1];
 792         }
 793     }else{
 794         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 795             *mx= A[0];
 796             *my= A[1];
 797         }else{
 798             *mx= mid_pred(A[0], B[0], C[0]);
 799             *my= mid_pred(A[1], B[1], C[1]);
 800         }
 801     }
 802
 803     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 804 }
 805
 806 /**
 807  * gets the directionally predicted 16x8 MV.
 808  * @param n the block index
 809  * @param mx the x component of the predicted motion vector
 810  * @param my the y component of the predicted motion vector
 811  */
 812 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 813     if(n==0){
 814         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 815         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 816
 817         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 818
 819         if(top_ref == ref){
 820             *mx= B[0];
 821             *my= B[1];
 822             return;
 823         }
 824     }else{
 825         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 826         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 827
 828         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 829
 830         if(left_ref == ref){
 831             *mx= A[0];
 832             *my= A[1];
 833             return;
 834         }
 835     }
 836
 837     //RARE
 838     pred_motion(h, n, 4, list, ref, mx, my);
 839 }
 840
 841 /**
 842  * gets the directionally predicted 8x16 MV.
 843  * @param n the block index
 844  * @param mx the x component of the predicted motion vector
 845  * @param my the y component of the predicted motion vector
 846  */
 847 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 848     if(n==0){
 849         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 850         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 851
 852         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 853
 854         if(left_ref == ref){
 855             *mx= A[0];
 856             *my= A[1];
 857             return;
 858         }
 859     }else{
 860         const int16_t * C;
 861         int diagonal_ref;
 862
 863         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 864
 865         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 866
 867         if(diagonal_ref == ref){
 868             *mx= C[0];
 869             *my= C[1];
 870             return;
 871         }
 872     }
 873
 874     //RARE
 875     pred_motion(h, n, 2, list, ref, mx, my);
 876 }
 877
 878 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 879     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 880     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 881
 882     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 883
 884     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 885        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 886        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 887
 888         *mx = *my = 0;
 889         return;
 890     }
 891
 892     pred_motion(h, 0, 4, 0, 0, mx, my);
 893
 894     return;
 895 }
 896
 897 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 898     int poc0 = h->ref_list[0][i].poc;
 899     int td = av_clip(poc1 - poc0, -128, 127);
 900     if(td == 0 || h->ref_list[0][i].long_ref){
 901         return 256;
 902     }else{
 903         int tb = av_clip(poc - poc0, -128, 127);
 904         int tx = (16384 + (FFABS(td) >> 1)) / td;
 905         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 906     }
 907 }
 908
 909 static inline void direct_dist_scale_factor(H264Context * const h){
 910     MpegEncContext * const s = &h->s;
 911     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 912     const int poc1 = h->ref_list[1][0].poc;
 913     int i, field;
 914     for(field=0; field<2; field++){
 915         const int poc  = h->s.current_picture_ptr->field_poc[field];
 916         const int poc1 = h->ref_list[1][0].field_poc[field];
 917         for(i=0; i < 2*h->ref_count[0]; i++)
 918             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 919     }
 920
 921     for(i=0; i<h->ref_count[0]; i++){
 922         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 923     }
 924 }
 925
 926 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 927     MpegEncContext * const s = &h->s;
 928     Picture * const ref1 = &h->ref_list[1][0];
 929     int j, old_ref, rfield;
 930     int start= mbafi ? 16                      : 0;
 931     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 932     int interl= mbafi || s->picture_structure != PICT_FRAME;
 933
 934     /* bogus; fills in for missing frames */
 935     memset(map[list], 0, sizeof(map[list]));
 936
 937     for(rfield=0; rfield<2; rfield++){
 938         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 939             int poc = ref1->ref_poc[colfield][list][old_ref];
 940
 941             if     (!interl)
 942                 poc |= 3;
 943             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 944                 poc= (poc&~3) + rfield + 1;
 945
 946             for(j=start; j<end; j++){
 947                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 948                     int cur_ref= mbafi ? (j-16)^field : j;
 949                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 950                     if(rfield == field)
 951                         map[list][old_ref] = cur_ref;
 952                     break;
 953                 }
 954             }
 955         }
 956     }
 957 }
 958
 959 static inline void direct_ref_list_init(H264Context * const h){
 960     MpegEncContext * const s = &h->s;
 961     Picture * const ref1 = &h->ref_list[1][0];
 962     Picture * const cur = s->current_picture_ptr;
 963     int list, j, field;
 964     int sidx= (s->picture_structure&1)^1;
 965     int ref1sidx= (ref1->reference&1)^1;
 966
 967     for(list=0; list<2; list++){
 968         cur->ref_count[sidx][list] = h->ref_count[list];
 969         for(j=0; j<h->ref_count[list]; j++)
 970             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 971     }
 972
 973     if(s->picture_structure == PICT_FRAME){
 974         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 975         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 976     }
 977
 978     cur->mbaff= FRAME_MBAFF;
 979
 980     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 981         return;
 982
 983     for(list=0; list<2; list++){
 984         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 985         for(field=0; field<2; field++)
 986             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 987     }
 988 }
 989
 990 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 991     MpegEncContext * const s = &h->s;
 992     int b8_stride = h->b8_stride;
 993     int b4_stride = h->b_stride;
 994     int mb_xy = h->mb_xy;
 995     int mb_type_col[2];
 996     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 997     const int8_t *l1ref0, *l1ref1;
 998     const int is_b8x8 = IS_8X8(*mb_type);
 999     unsigned int sub_mb_type;
1000     int i8, i4;
1001
1002 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1003
1004     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
1005         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
1006             int cur_poc = s->current_picture_ptr->poc;
1007             int *col_poc = h->ref_list[1]->field_poc;
1008             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1009             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1010             b8_stride = 0;
1011         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1012             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1013             mb_xy += s->mb_stride*fieldoff;
1014         }
1015         goto single_col;
1016     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1017         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1018             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1019             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1020             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1021             b8_stride *= 3;
1022             b4_stride *= 6;
1023             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1024             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1025                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1026                 && !is_b8x8){
1027                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1028                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1029             }else{
1030                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1031                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1032             }
1033         }else{                                           //     AFR/FR    -> AFR/FR
1034 single_col:
1035             mb_type_col[0] =
1036             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1037             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1038                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1039                 * so we know exactly what block size to use */
1040                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1041                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1042             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1043                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1044                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1045             }else{
1046                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1047                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1048             }
1049         }
1050     }
1051
1052     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1053     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1054     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1055     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1056     if(!b8_stride){
1057         if(s->mb_y&1){
1058             l1ref0 += h->b8_stride;
1059             l1ref1 += h->b8_stride;
1060             l1mv0  +=  2*b4_stride;
1061             l1mv1  +=  2*b4_stride;
1062         }
1063     }
1064
1065     if(h->direct_spatial_mv_pred){
1066         int ref[2];
1067         int mv[2][2];
1068         int list;
1069
1070         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1071
1072         /* ref = min(neighbors) */
1073         for(list=0; list<2; list++){
1074             int refa = h->ref_cache[list][scan8[0] - 1];
1075             int refb = h->ref_cache[list][scan8[0] - 8];
1076             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1077             if(refc == PART_NOT_AVAILABLE)
1078                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1079             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1080             if(ref[list] < 0)
1081                 ref[list] = -1;
1082         }
1083
1084         if(ref[0] < 0 && ref[1] < 0){
1085             ref[0] = ref[1] = 0;
1086             mv[0][0] = mv[0][1] =
1087             mv[1][0] = mv[1][1] = 0;
1088         }else{
1089             for(list=0; list<2; list++){
1090                 if(ref[list] >= 0)
1091                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1092                 else
1093                     mv[list][0] = mv[list][1] = 0;
1094             }
1095         }
1096
1097         if(ref[1] < 0){
1098             if(!is_b8x8)
1099                 *mb_type &= ~MB_TYPE_L1;
1100             sub_mb_type &= ~MB_TYPE_L1;
1101         }else if(ref[0] < 0){
1102             if(!is_b8x8)
1103                 *mb_type &= ~MB_TYPE_L0;
1104             sub_mb_type &= ~MB_TYPE_L0;
1105         }
1106
1107         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1108             for(i8=0; i8<4; i8++){
1109                 int x8 = i8&1;
1110                 int y8 = i8>>1;
1111                 int xy8 = x8+y8*b8_stride;
1112                 int xy4 = 3*x8+y8*b4_stride;
1113                 int a=0, b=0;
1114
1115                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1116                     continue;
1117                 h->sub_mb_type[i8] = sub_mb_type;
1118
1119                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1120                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1121                 if(!IS_INTRA(mb_type_col[y8])
1122                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1123                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1124                     if(ref[0] > 0)
1125                         a= pack16to32(mv[0][0],mv[0][1]);
1126                     if(ref[1] > 0)
1127                         b= pack16to32(mv[1][0],mv[1][1]);
1128                 }else{
1129                     a= pack16to32(mv[0][0],mv[0][1]);
1130                     b= pack16to32(mv[1][0],mv[1][1]);
1131                 }
1132                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1133                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1134             }
1135         }else if(IS_16X16(*mb_type)){
1136             int a=0, b=0;
1137
1138             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1139             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1140             if(!IS_INTRA(mb_type_col[0])
1141                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1142                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1143                        && (h->x264_build>33 || !h->x264_build)))){
1144                 if(ref[0] > 0)
1145                     a= pack16to32(mv[0][0],mv[0][1]);
1146                 if(ref[1] > 0)
1147                     b= pack16to32(mv[1][0],mv[1][1]);
1148             }else{
1149                 a= pack16to32(mv[0][0],mv[0][1]);
1150                 b= pack16to32(mv[1][0],mv[1][1]);
1151             }
1152             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1153             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1154         }else{
1155             for(i8=0; i8<4; i8++){
1156                 const int x8 = i8&1;
1157                 const int y8 = i8>>1;
1158
1159                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1160                     continue;
1161                 h->sub_mb_type[i8] = sub_mb_type;
1162
1163                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1164                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1165                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1166                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1167
1168                 /* col_zero_flag */
1169                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1170                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1171                                                   && (h->x264_build>33 || !h->x264_build)))){
1172                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1173                     if(IS_SUB_8X8(sub_mb_type)){
1174                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1175                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1176                             if(ref[0] == 0)
1177                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1178                             if(ref[1] == 0)
1179                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1180                         }
1181                     }else
1182                     for(i4=0; i4<4; i4++){
1183                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1184                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1185                             if(ref[0] == 0)
1186                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1187                             if(ref[1] == 0)
1188                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1189                         }
1190                     }
1191                 }
1192             }
1193         }
1194     }else{ /* direct temporal mv pred */
1195         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1196         const int *dist_scale_factor = h->dist_scale_factor;
1197         int ref_offset= 0;
1198
1199         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1200             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1201             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1202             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1203         }
1204         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1205             ref_offset += 16;
1206
1207         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1208             /* FIXME assumes direct_8x8_inference == 1 */
1209             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1210
1211             for(i8=0; i8<4; i8++){
1212                 const int x8 = i8&1;
1213                 const int y8 = i8>>1;
1214                 int ref0, scale;
1215                 const int16_t (*l1mv)[2]= l1mv0;
1216
1217                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1218                     continue;
1219                 h->sub_mb_type[i8] = sub_mb_type;
1220
1221                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1222                 if(IS_INTRA(mb_type_col[y8])){
1223                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1224                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1225                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1226                     continue;
1227                 }
1228
1229                 ref0 = l1ref0[x8 + y8*b8_stride];
1230                 if(ref0 >= 0)
1231                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1232                 else{
1233                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1234                     l1mv= l1mv1;
1235                 }
1236                 scale = dist_scale_factor[ref0];
1237                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1238
1239                 {
1240                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1241                     int my_col = (mv_col[1]<<y_shift)/2;
1242                     int mx = (scale * mv_col[0] + 128) >> 8;
1243                     int my = (scale * my_col + 128) >> 8;
1244                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1245                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1246                 }
1247             }
1248             return;
1249         }
1250
1251         /* one-to-one mv scaling */
1252
1253         if(IS_16X16(*mb_type)){
1254             int ref, mv0, mv1;
1255
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1257             if(IS_INTRA(mb_type_col[0])){
1258                 ref=mv0=mv1=0;
1259             }else{
1260                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1261                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1262                 const int scale = dist_scale_factor[ref0];
1263                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1264                 int mv_l0[2];
1265                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1266                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1267                 ref= ref0;
1268                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1269                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1270             }
1271             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1272             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1273             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1274         }else{
1275             for(i8=0; i8<4; i8++){
1276                 const int x8 = i8&1;
1277                 const int y8 = i8>>1;
1278                 int ref0, scale;
1279                 const int16_t (*l1mv)[2]= l1mv0;
1280
1281                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1282                     continue;
1283                 h->sub_mb_type[i8] = sub_mb_type;
1284                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1285                 if(IS_INTRA(mb_type_col[0])){
1286                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1287                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1288                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1289                     continue;
1290                 }
1291
1292                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1293                 if(ref0 >= 0)
1294                     ref0 = map_col_to_list0[0][ref0];
1295                 else{
1296                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1297                     l1mv= l1mv1;
1298                 }
1299                 scale = dist_scale_factor[ref0];
1300
1301                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1302                 if(IS_SUB_8X8(sub_mb_type)){
1303                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1304                     int mx = (scale * mv_col[0] + 128) >> 8;
1305                     int my = (scale * mv_col[1] + 128) >> 8;
1306                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1307                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1308                 }else
1309                 for(i4=0; i4<4; i4++){
1310                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1311                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1312                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1313                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1314                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1315                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1316                 }
1317             }
1318         }
1319     }
1320 }
1321
1322 static inline void write_back_motion(H264Context *h, int mb_type){
1323     MpegEncContext * const s = &h->s;
1324     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1325     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1326     int list;
1327
1328     if(!USES_LIST(mb_type, 0))
1329         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1330
1331     for(list=0; list<h->list_count; list++){
1332         int y;
1333         if(!USES_LIST(mb_type, list))
1334             continue;
1335
1336         for(y=0; y<4; y++){
1337             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1338             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1339         }
1340         if( h->pps.cabac ) {
1341             if(IS_SKIP(mb_type))
1342                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1343             else
1344             for(y=0; y<4; y++){
1345                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1346                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1347             }
1348         }
1349
1350         {
1351             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1352             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1353             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1354             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1355             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1356         }
1357     }
1358
1359     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1360         if(IS_8X8(mb_type)){
1361             uint8_t *direct_table = &h->direct_table[b8_xy];
1362             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1363             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1364             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1365         }
1366     }
1367 }
1368
1369 /**
1370  * Decodes a network abstraction layer unit.
1371  * @param consumed is the number of bytes used as input
1372  * @param length is the length of the array
1373  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1374  * @returns decoded bytes, might be src+1 if no escapes
1375  */
1376 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1377     int i, si, di;
1378     uint8_t *dst;
1379     int bufidx;
1380
1381 //    src[0]&0x80;                //forbidden bit
1382     h->nal_ref_idc= src[0]>>5;
1383     h->nal_unit_type= src[0]&0x1F;
1384
1385     src++; length--;
1386 #if 0
1387     for(i=0; i<length; i++)
1388         printf("%2X ", src[i]);
1389 #endif
1390     for(i=0; i+1<length; i+=2){
1391         if(src[i]) continue;
1392         if(i>0 && src[i-1]==0) i--;
1393         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1394             if(src[i+2]!=3){
1395                 /* startcode, so we must be past the end */
1396                 length=i;
1397             }
1398             break;
1399         }
1400     }
1401
1402     if(i>=length-1){ //no escaped 0
1403         *dst_length= length;
1404         *consumed= length+1; //+1 for the header
1405         return src;
1406     }
1407
1408     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1409     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1410     dst= h->rbsp_buffer[bufidx];
1411
1412     if (dst == NULL){
1413         return NULL;
1414     }
1415
1416 //printf("decoding esc\n");
1417     si=di=0;
1418     while(si<length){
1419         //remove escapes (very rare 1:2^22)
1420         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1421             if(src[si+2]==3){ //escape
1422                 dst[di++]= 0;
1423                 dst[di++]= 0;
1424                 si+=3;
1425                 continue;
1426             }else //next start code
1427                 break;
1428         }
1429
1430         dst[di++]= src[si++];
1431     }
1432
1433     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1434
1435     *dst_length= di;
1436     *consumed= si + 1;//+1 for the header
1437 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1438     return dst;
1439 }
1440
1441 /**
1442  * identifies the exact end of the bitstream
1443  * @return the length of the trailing, or 0 if damaged
1444  */
1445 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1446     int v= *src;
1447     int r;
1448
1449     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1450
1451     for(r=1; r<9; r++){
1452         if(v&1) return r;
1453         v>>=1;
1454     }
1455     return 0;
1456 }
1457
1458 /**
1459  * IDCT transforms the 16 dc values and dequantizes them.
1460  * @param qp quantization parameter
1461  */
1462 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1463 #define stride 16
1464     int i;
1465     int temp[16]; //FIXME check if this is a good idea
1466     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1467     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1468
1469 //memset(block, 64, 2*256);
1470 //return;
1471     for(i=0; i<4; i++){
1472         const int offset= y_offset[i];
1473         const int z0= block[offset+stride*0] + block[offset+stride*4];
1474         const int z1= block[offset+stride*0] - block[offset+stride*4];
1475         const int z2= block[offset+stride*1] - block[offset+stride*5];
1476         const int z3= block[offset+stride*1] + block[offset+stride*5];
1477
1478         temp[4*i+0]= z0+z3;
1479         temp[4*i+1]= z1+z2;
1480         temp[4*i+2]= z1-z2;
1481         temp[4*i+3]= z0-z3;
1482     }
1483
1484     for(i=0; i<4; i++){
1485         const int offset= x_offset[i];
1486         const int z0= temp[4*0+i] + temp[4*2+i];
1487         const int z1= temp[4*0+i] - temp[4*2+i];
1488         const int z2= temp[4*1+i] - temp[4*3+i];
1489         const int z3= temp[4*1+i] + temp[4*3+i];
1490
1491         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1492         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1493         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1494         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1495     }
1496 }
1497
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it relies on the stride macro still being defined here.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    // first pass: butterflies over block, results go to temp
    for(i=0; i<4; i++){
        const DCTELEM * const in= block + y_offset[i];
        int * const out= temp + 4*i;
        const int sum04 = in[stride*0] + in[stride*4];
        const int diff04= in[stride*0] - in[stride*4];
        const int diff15= in[stride*1] - in[stride*5];
        const int sum15 = in[stride*1] + in[stride*5];

        out[0]= sum04  + sum15;
        out[1]= diff04 + diff15;
        out[2]= diff04 - diff15;
        out[3]= sum04  - sum15;
    }

    // second pass: butterflies over temp, halve and store
    for(i=0; i<4; i++){
        DCTELEM * const out= block + x_offset[i];
        const int * const in= temp + i;
        const int sum02 = in[4*0] + in[4*2];
        const int diff02= in[4*0] - in[4*2];
        const int diff13= in[4*1] - in[4*3];
        const int sum13 = in[4*1] + in[4*3];

        out[stride*0 ]= (sum02  + sum13 )>>1;
        out[stride*2 ]= (diff02 + diff13)>>1;
        out[stride*8 ]= (diff02 - diff13)>>1;
        out[stride*10]= (sum02  - sum13 )>>1;
    }
}
#endif
1537
1538 #undef xStride
1539 #undef stride
1540
1541 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1542     const int stride= 16*2;
1543     const int xStride= 16;
1544     int a,b,c,d,e;
1545
1546     a= block[stride*0 + xStride*0];
1547     b= block[stride*0 + xStride*1];
1548     c= block[stride*1 + xStride*0];
1549     d= block[stride*1 + xStride*1];
1550
1551     e= a-b;
1552     a= a+b;
1553     b= c-d;
1554     c= c+d;
1555
1556     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1557     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1558     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1559     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1560 }
1561
#if 0
/**
 * Transforms the 2x2 chroma dc values found at the element offsets
 * 0, 16, 32 and 48 of block, in place. Disabled code.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const row0= block;
    DCTELEM * const row1= block + row_step;
    const int row0_sum = row0[0] + row0[col_step];
    const int row0_diff= row0[0] - row0[col_step];
    const int row1_sum = row1[0] + row1[col_step];
    const int row1_diff= row1[0] - row1[col_step];

    row0[0       ]= row0_sum  + row1_sum;
    row0[col_step]= row0_diff + row1_diff;
    row1[0       ]= row0_sum  - row1_sum;
    row1[col_step]= row0_diff - row1_diff;
}
#endif
1584
1585 /**
1586  * gets the chroma qp.
1587  */
1588 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1589     return h->pps.chroma_qp_table[t][qscale];
1590 }
1591
1592 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1593                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1594                            int src_x_offset, int src_y_offset,
1595                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1596     MpegEncContext * const s = &h->s;
1597     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1598     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1599     const int luma_xy= (mx&3) + ((my&3)<<2);
1600     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1601     uint8_t * src_cb, * src_cr;
1602     int extra_width= h->emu_edge_width;
1603     int extra_height= h->emu_edge_height;
1604     int emu=0;
1605     const int full_mx= mx>>2;
1606     const int full_my= my>>2;
1607     const int pic_width  = 16*s->mb_width;
1608     const int pic_height = 16*s->mb_height >> MB_FIELD;
1609
1610     if(mx&7) extra_width -= 3;
1611     if(my&7) extra_height -= 3;
1612
1613     if(   full_mx < 0-extra_width
1614        || full_my < 0-extra_height
1615        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1616        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1617         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1618             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1619         emu=1;
1620     }
1621
1622     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1623     if(!square){
1624         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1625     }
1626
1627     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1628
1629     if(MB_FIELD){
1630         // chroma offset when predicting from a field of opposite parity
1631         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1632         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1633     }
1634     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1635     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1636
1637     if(emu){
1638         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1639             src_cb= s->edge_emu_buffer;
1640     }
1641     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1642
1643     if(emu){
1644         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1645             src_cr= s->edge_emu_buffer;
1646     }
1647     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1648 }
1649
1650 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1651                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1652                            int x_offset, int y_offset,
1653                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1654                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1655                            int list0, int list1){
1656     MpegEncContext * const s = &h->s;
1657     qpel_mc_func *qpix_op=  qpix_put;
1658     h264_chroma_mc_func chroma_op= chroma_put;
1659
1660     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1661     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1662     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1663     x_offset += 8*s->mb_x;
1664     y_offset += 8*(s->mb_y >> MB_FIELD);
1665
1666     if(list0){
1667         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1668         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1669                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1670                            qpix_op, chroma_op);
1671
1672         qpix_op=  qpix_avg;
1673         chroma_op= chroma_avg;
1674     }
1675
1676     if(list1){
1677         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1678         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1679                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1680                            qpix_op, chroma_op);
1681     }
1682 }
1683
1684 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1685                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1686                            int x_offset, int y_offset,
1687                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1688                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1689                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1690                            int list0, int list1){
1691     MpegEncContext * const s = &h->s;
1692
1693     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1694     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1695     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1696     x_offset += 8*s->mb_x;
1697     y_offset += 8*(s->mb_y >> MB_FIELD);
1698
1699     if(list0 && list1){
1700         /* don't optimize for luma-only case, since B-frames usually
1701          * use implicit weights => chroma too. */
1702         uint8_t *tmp_cb = s->obmc_scratchpad;
1703         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1704         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1705         int refn0 = h->ref_cache[0][ scan8[n] ];
1706         int refn1 = h->ref_cache[1][ scan8[n] ];
1707
1708         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1709                     dest_y, dest_cb, dest_cr,
1710                     x_offset, y_offset, qpix_put, chroma_put);
1711         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1712                     tmp_y, tmp_cb, tmp_cr,
1713                     x_offset, y_offset, qpix_put, chroma_put);
1714
1715         if(h->use_weight == 2){
1716             int weight0 = h->implicit_weight[refn0][refn1];
1717             int weight1 = 64 - weight0;
1718             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1719             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1720             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1721         }else{
1722             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1723                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1724                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1725             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1726                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1727                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1728             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1729                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1730                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1731         }
1732     }else{
1733         int list = list1 ? 1 : 0;
1734         int refn = h->ref_cache[list][ scan8[n] ];
1735         Picture *ref= &h->ref_list[list][refn];
1736         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1737                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1738                     qpix_put, chroma_put);
1739
1740         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1742         if(h->use_weight_chroma){
1743             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1745             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1746                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1747         }
1748     }
1749 }
1750
1751 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1752                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1753                            int x_offset, int y_offset,
1754                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1755                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1756                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1757                            int list0, int list1){
1758     if((h->use_weight==2 && list0 && list1
1759         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1760        || h->use_weight==1)
1761         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1762                          x_offset, y_offset, qpix_put, chroma_put,
1763                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1764     else
1765         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1766                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1767 }
1768
1769 static inline void prefetch_motion(H264Context *h, int list){
1770     /* fetch pixels for estimated mv 4 macroblocks ahead
1771      * optimized for 64byte cache lines */
1772     MpegEncContext * const s = &h->s;
1773     const int refn = h->ref_cache[list][scan8[0]];
1774     if(refn >= 0){
1775         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1776         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1777         uint8_t **src= h->ref_list[list][refn].data;
1778         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1779         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1780         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1781         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1782     }
1783 }
1784
1785 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1786                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1787                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1788                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1789     MpegEncContext * const s = &h->s;
1790     const int mb_xy= h->mb_xy;
1791     const int mb_type= s->current_picture.mb_type[mb_xy];
1792
1793     assert(IS_INTER(mb_type));
1794
1795     prefetch_motion(h, 0);
1796
1797     if(IS_16X16(mb_type)){
1798         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1799                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1800                 &weight_op[0], &weight_avg[0],
1801                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1802     }else if(IS_16X8(mb_type)){
1803         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1804                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1805                 &weight_op[1], &weight_avg[1],
1806                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1807         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1808                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1809                 &weight_op[1], &weight_avg[1],
1810                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1811     }else if(IS_8X16(mb_type)){
1812         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1813                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1814                 &weight_op[2], &weight_avg[2],
1815                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1816         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1817                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1818                 &weight_op[2], &weight_avg[2],
1819                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1820     }else{
1821         int i;
1822
1823         assert(IS_8X8(mb_type));
1824
1825         for(i=0; i<4; i++){
1826             const int sub_mb_type= h->sub_mb_type[i];
1827             const int n= 4*i;
1828             int x_offset= (i&1)<<2;
1829             int y_offset= (i&2)<<1;
1830
1831             if(IS_SUB_8X8(sub_mb_type)){
1832                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1833                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1834                     &weight_op[3], &weight_avg[3],
1835                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1836             }else if(IS_SUB_8X4(sub_mb_type)){
1837                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1838                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1839                     &weight_op[4], &weight_avg[4],
1840                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1841                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1842                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1843                     &weight_op[4], &weight_avg[4],
1844                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1845             }else if(IS_SUB_4X8(sub_mb_type)){
1846                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1847                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1848                     &weight_op[5], &weight_avg[5],
1849                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1850                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1851                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1852                     &weight_op[5], &weight_avg[5],
1853                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1854             }else{
1855                 int j;
1856                 assert(IS_SUB_4X4(sub_mb_type));
1857                 for(j=0; j<4; j++){
1858                     int sub_x_offset= x_offset + 2*(j&1);
1859                     int sub_y_offset= y_offset +   (j&2);
1860                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1861                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1862                         &weight_op[6], &weight_avg[6],
1863                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1864                 }
1865             }
1866         }
1867     }
1868
1869     prefetch_motion(h, 1);
1870 }
1871
1872 static av_cold void decode_init_vlc(void){
1873     static int done = 0;
1874
1875     if (!done) {
1876         int i;
1877         int offset;
1878         done = 1;
1879
1880         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1881         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1882         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1883                  &chroma_dc_coeff_token_len [0], 1, 1,
1884                  &chroma_dc_coeff_token_bits[0], 1, 1,
1885                  INIT_VLC_USE_NEW_STATIC);
1886
1887         offset = 0;
1888         for(i=0; i<4; i++){
1889             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1890             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1891             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1892                      &coeff_token_len [i][0], 1, 1,
1893                      &coeff_token_bits[i][0], 1, 1,
1894                      INIT_VLC_USE_NEW_STATIC);
1895             offset += coeff_token_vlc_tables_size[i];
1896         }
1897         /*
1898          * This is a one time safety check to make sure that
1899          * the packed static coeff_token_vlc table sizes
1900          * were initialized correctly.
1901          */
1902         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1903
1904         for(i=0; i<3; i++){
1905             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1906             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1907             init_vlc(&chroma_dc_total_zeros_vlc[i],
1908                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1909                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1910                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1911                      INIT_VLC_USE_NEW_STATIC);
1912         }
1913         for(i=0; i<15; i++){
1914             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1915             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1916             init_vlc(&total_zeros_vlc[i],
1917                      TOTAL_ZEROS_VLC_BITS, 16,
1918                      &total_zeros_len [i][0], 1, 1,
1919                      &total_zeros_bits[i][0], 1, 1,
1920                      INIT_VLC_USE_NEW_STATIC);
1921         }
1922
1923         for(i=0; i<6; i++){
1924             run_vlc[i].table = run_vlc_tables[i];
1925             run_vlc[i].table_allocated = run_vlc_tables_size;
1926             init_vlc(&run_vlc[i],
1927                      RUN_VLC_BITS, 7,
1928                      &run_len [i][0], 1, 1,
1929                      &run_bits[i][0], 1, 1,
1930                      INIT_VLC_USE_NEW_STATIC);
1931         }
1932         run7_vlc.table = run7_vlc_table,
1933         run7_vlc.table_allocated = run7_vlc_table_size;
1934         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1935                  &run_len [6][0], 1, 1,
1936                  &run_bits[6][0], 1, 1,
1937                  INIT_VLC_USE_NEW_STATIC);
1938     }
1939 }
1940
1941 static void free_tables(H264Context *h){
1942     int i;
1943     H264Context *hx;
1944     av_freep(&h->intra4x4_pred_mode);
1945     av_freep(&h->chroma_pred_mode_table);
1946     av_freep(&h->cbp_table);
1947     av_freep(&h->mvd_table[0]);
1948     av_freep(&h->mvd_table[1]);
1949     av_freep(&h->direct_table);
1950     av_freep(&h->non_zero_count);
1951     av_freep(&h->slice_table_base);
1952     h->slice_table= NULL;
1953
1954     av_freep(&h->mb2b_xy);
1955     av_freep(&h->mb2b8_xy);
1956
1957     for(i = 0; i < h->s.avctx->thread_count; i++) {
1958         hx = h->thread_context[i];
1959         if(!hx) continue;
1960         av_freep(&hx->top_borders[1]);
1961         av_freep(&hx->top_borders[0]);
1962         av_freep(&hx->s.obmc_scratchpad);
1963     }
1964 }
1965
1966 static void init_dequant8_coeff_table(H264Context *h){
1967     int i,q,x;
1968     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
1969     h->dequant8_coeff[0] = h->dequant8_buffer[0];
1970     h->dequant8_coeff[1] = h->dequant8_buffer[1];
1971
1972     for(i=0; i<2; i++ ){
1973         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
1974             h->dequant8_coeff[1] = h->dequant8_buffer[0];
1975             break;
1976         }
1977
1978         for(q=0; q<52; q++){
1979             int shift = div6[q];
1980             int idx = rem6[q];
1981             for(x=0; x<64; x++)
1982                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
1983                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
1984                     h->pps.scaling_matrix8[i][x]) << shift;
1985         }
1986     }
1987 }
1988
1989 static void init_dequant4_coeff_table(H264Context *h){
1990     int i,j,q,x;
1991     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
1992     for(i=0; i<6; i++ ){
1993         h->dequant4_coeff[i] = h->dequant4_buffer[i];
1994         for(j=0; j<i; j++){
1995             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
1996                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
1997                 break;
1998             }
1999         }
2000         if(j<i)
2001             continue;
2002
2003         for(q=0; q<52; q++){
2004             int shift = div6[q] + 2;
2005             int idx = rem6[q];
2006             for(x=0; x<16; x++)
2007                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2008                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2009                     h->pps.scaling_matrix4[i][x]) << shift;
2010         }
2011     }
2012 }
2013
2014 static void init_dequant_tables(H264Context *h){
2015     int i,x;
2016     init_dequant4_coeff_table(h);
2017     if(h->pps.transform_8x8_mode)
2018         init_dequant8_coeff_table(h);
2019     if(h->sps.transform_bypass){
2020         for(i=0; i<6; i++)
2021             for(x=0; x<16; x++)
2022                 h->dequant4_coeff[i][0][x] = 1<<6;
2023         if(h->pps.transform_8x8_mode)
2024             for(i=0; i<2; i++)
2025                 for(x=0; x<64; x++)
2026                     h->dequant8_coeff[i][0][x] = 1<<6;
2027     }
2028 }
2029
2030
2031 /**
2032  * allocates tables.
2033  * needs width/height
2034  */
2035 static int alloc_tables(H264Context *h){
2036     MpegEncContext * const s = &h->s;
2037     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2038     int x,y;
2039
2040     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2041
2042     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2043     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2044     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2045
2046     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2047     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2048     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2049     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2050
2051     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2052     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2053
2054     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2055     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2056     for(y=0; y<s->mb_height; y++){
2057         for(x=0; x<s->mb_width; x++){
2058             const int mb_xy= x + y*s->mb_stride;
2059             const int b_xy = 4*x + 4*y*h->b_stride;
2060             const int b8_xy= 2*x + 2*y*h->b8_stride;
2061
2062             h->mb2b_xy [mb_xy]= b_xy;
2063             h->mb2b8_xy[mb_xy]= b8_xy;
2064         }
2065     }
2066
2067     s->obmc_scratchpad = NULL;
2068
2069     if(!h->dequant4_coeff[0])
2070         init_dequant_tables(h);
2071
2072     return 0;
2073 fail:
2074     free_tables(h);
2075     return -1;
2076 }
2077
2078 /**
2079  * Mimic alloc_tables(), but for every context thread.
2080  */
2081 static void clone_tables(H264Context *dst, H264Context *src){
2082     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2083     dst->non_zero_count           = src->non_zero_count;
2084     dst->slice_table              = src->slice_table;
2085     dst->cbp_table                = src->cbp_table;
2086     dst->mb2b_xy                  = src->mb2b_xy;
2087     dst->mb2b8_xy                 = src->mb2b8_xy;
2088     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2089     dst->mvd_table[0]             = src->mvd_table[0];
2090     dst->mvd_table[1]             = src->mvd_table[1];
2091     dst->direct_table             = src->direct_table;
2092
2093     dst->s.obmc_scratchpad = NULL;
2094     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2095 }
2096
2097 /**
2098  * Init context
2099  * Allocate buffers which are not shared amongst multiple threads.
2100  */
2101 static int context_init(H264Context *h){
2102     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2104
2105     return 0;
2106 fail:
2107     return -1; // free_tables will clean up for us
2108 }
2109
2110 static av_cold void common_init(H264Context *h){
2111     MpegEncContext * const s = &h->s;
2112
2113     s->width = s->avctx->width;
2114     s->height = s->avctx->height;
2115     s->codec_id= s->avctx->codec->id;
2116
2117     ff_h264_pred_init(&h->hpc, s->codec_id);
2118
2119     h->dequant_coeff_pps= -1;
2120     s->unrestricted_mv=1;
2121     s->decode=1; //FIXME
2122
2123     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2124
2125     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2126     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2127 }
2128
2129 static av_cold int decode_init(AVCodecContext *avctx){
2130     H264Context *h= avctx->priv_data;
2131     MpegEncContext * const s = &h->s;
2132
2133     MPV_decode_defaults(s);
2134
2135     s->avctx = avctx;
2136     common_init(h);
2137
2138     s->out_format = FMT_H264;
2139     s->workaround_bugs= avctx->workaround_bugs;
2140
2141     // set defaults
2142 //    s->decode_mb= ff_h263_decode_mb;
2143     s->quarter_sample = 1;
2144     s->low_delay= 1;
2145
2146     if(avctx->codec_id == CODEC_ID_SVQ3)
2147         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2148     else
2149         avctx->pix_fmt= PIX_FMT_YUV420P;
2150
2151     decode_init_vlc();
2152
2153     if(avctx->extradata_size > 0 && avctx->extradata &&
2154        *(char *)avctx->extradata == 1){
2155         h->is_avc = 1;
2156         h->got_avcC = 0;
2157     } else {
2158         h->is_avc = 0;
2159     }
2160
2161     h->thread_context[0] = h;
2162     h->outputed_poc = INT_MIN;
2163     h->prev_poc_msb= 1<<16;
2164     return 0;
2165 }
2166
2167 static int frame_start(H264Context *h){
2168     MpegEncContext * const s = &h->s;
2169     int i;
2170
2171     if(MPV_frame_start(s, s->avctx) < 0)
2172         return -1;
2173     ff_er_frame_start(s);
2174     /*
2175      * MPV_frame_start uses pict_type to derive key_frame.
2176      * This is incorrect for H.264; IDR markings must be used.
2177      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2178      * See decode_nal_units().
2179      */
2180     s->current_picture_ptr->key_frame= 0;
2181
2182     assert(s->linesize && s->uvlinesize);
2183
2184     for(i=0; i<16; i++){
2185         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2186         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2187     }
2188     for(i=0; i<4; i++){
2189         h->block_offset[16+i]=
2190         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2191         h->block_offset[24+16+i]=
2192         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2193     }
2194
2195     /* can't be in alloc_tables because linesize isn't known there.
2196      * FIXME: redo bipred weight to not require extra buffer? */
2197     for(i = 0; i < s->avctx->thread_count; i++)
2198         if(!h->thread_context[i]->s.obmc_scratchpad)
2199             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2200
2201     /* some macroblocks will be accessed before they're available */
2202     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2203         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2204
2205 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2206
2207     // We mark the current picture as non-reference after allocating it, so
2208     // that if we break out due to an error it can be released automatically
2209     // in the next MPV_frame_start().
2210     // SVQ3 as well as most other codecs have only last/next/current and thus
2211     // get released even with set reference, besides SVQ3 and others do not
2212     // mark frames as reference later "naturally".
2213     if(s->codec_id != CODEC_ID_SVQ3)
2214         s->current_picture_ptr->reference= 0;
2215
2216     s->current_picture_ptr->field_poc[0]=
2217     s->current_picture_ptr->field_poc[1]= INT_MAX;
2218     assert(s->current_picture_ptr->long_ref==0);
2219
2220     return 0;
2221 }
2222
2223 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2224     MpegEncContext * const s = &h->s;
2225     int i;
2226     int step    = 1;
2227     int offset  = 1;
2228     int uvoffset= 1;
2229     int top_idx = 1;
2230     int skiplast= 0;
2231
2232     src_y  -=   linesize;
2233     src_cb -= uvlinesize;
2234     src_cr -= uvlinesize;
2235
2236     if(!simple && FRAME_MBAFF){
2237         if(s->mb_y&1){
2238             offset  = MB_MBAFF ? 1 : 17;
2239             uvoffset= MB_MBAFF ? 1 : 9;
2240             if(!MB_MBAFF){
2241                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2242                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2243                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2244                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2245                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2246                 }
2247             }
2248         }else{
2249             if(!MB_MBAFF){
2250                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2251                 if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2252                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2253                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2254                 }
2255                 skiplast= 1;
2256             }
2257             offset  =
2258             uvoffset=
2259             top_idx = MB_MBAFF ? 0 : 1;
2260         }
2261         step= MB_MBAFF ? 2 : 1;
2262     }
2263
2264     // There are two lines saved, the line above the the top macroblock of a pair,
2265     // and the line above the bottom macroblock
2266     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2267     for(i=1; i<17 - skiplast; i++){
2268         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2269     }
2270
2271     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2272     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2273
2274     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2275         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2276         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2277         for(i=1; i<9 - skiplast; i++){
2278             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2279             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2280         }
2281         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2282         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2283     }
2284 }
2285
2286 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2287     MpegEncContext * const s = &h->s;
2288     int temp8, i;
2289     uint64_t temp64;
2290     int deblock_left;
2291     int deblock_top;
2292     int mb_xy;
2293     int step    = 1;
2294     int offset  = 1;
2295     int uvoffset= 1;
2296     int top_idx = 1;
2297
2298     if(!simple && FRAME_MBAFF){
2299         if(s->mb_y&1){
2300             offset  = MB_MBAFF ? 1 : 17;
2301             uvoffset= MB_MBAFF ? 1 : 9;
2302         }else{
2303             offset  =
2304             uvoffset=
2305             top_idx = MB_MBAFF ? 0 : 1;
2306         }
2307         step= MB_MBAFF ? 2 : 1;
2308     }
2309
2310     if(h->deblocking_filter == 2) {
2311         mb_xy = h->mb_xy;
2312         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2313         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2314     } else {
2315         deblock_left = (s->mb_x > 0);
2316         deblock_top =  (s->mb_y > !!MB_FIELD);
2317     }
2318
2319     src_y  -=   linesize + 1;
2320     src_cb -= uvlinesize + 1;
2321     src_cr -= uvlinesize + 1;
2322
2323 #define XCHG(a,b,t,xchg)\
2324 t= a;\
2325 if(xchg)\
2326     a= b;\
2327 b= t;
2328
2329     if(deblock_left){
2330         for(i = !deblock_top; i<16; i++){
2331             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2332         }
2333         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2334     }
2335
2336     if(deblock_top){
2337         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2338         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2339         if(s->mb_x+1 < s->mb_width){
2340             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2341         }
2342     }
2343
2344     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2345         if(deblock_left){
2346             for(i = !deblock_top; i<8; i++){
2347                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2348                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2349             }
2350             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2351             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2352         }
2353         if(deblock_top){
2354             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2355             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2356         }
2357     }
2358 }
2359
2360 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2361     MpegEncContext * const s = &h->s;
2362     const int mb_x= s->mb_x;
2363     const int mb_y= s->mb_y;
2364     const int mb_xy= h->mb_xy;
2365     const int mb_type= s->current_picture.mb_type[mb_xy];
2366     uint8_t  *dest_y, *dest_cb, *dest_cr;
2367     int linesize, uvlinesize /*dct_offset*/;
2368     int i;
2369     int *block_offset = &h->block_offset[0];
2370     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2371     const int is_h264 = simple || s->codec_id == CODEC_ID_H264;
2372     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2373     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2374
2375     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2376     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2377     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2378
2379     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2380     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2381
2382     if (!simple && MB_FIELD) {
2383         linesize   = h->mb_linesize   = s->linesize * 2;
2384         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2385         block_offset = &h->block_offset[24];
2386         if(mb_y&1){ //FIXME move out of this function?
2387             dest_y -= s->linesize*15;
2388             dest_cb-= s->uvlinesize*7;
2389             dest_cr-= s->uvlinesize*7;
2390         }
2391         if(FRAME_MBAFF) {
2392             int list;
2393             for(list=0; list<h->list_count; list++){
2394                 if(!USES_LIST(mb_type, list))
2395                     continue;
2396                 if(IS_16X16(mb_type)){
2397                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2398                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2399                 }else{
2400                     for(i=0; i<16; i+=4){
2401                         int ref = h->ref_cache[list][scan8[i]];
2402                         if(ref >= 0)
2403                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2404                     }
2405                 }
2406             }
2407         }
2408     } else {
2409         linesize   = h->mb_linesize   = s->linesize;
2410         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2411 //        dct_offset = s->linesize * 16;
2412     }
2413
2414     if (!simple && IS_INTRA_PCM(mb_type)) {
2415         for (i=0; i<16; i++) {
2416             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2417         }
2418         for (i=0; i<8; i++) {
2419             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2420             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2421         }
2422     } else {
2423         if(IS_INTRA(mb_type)){
2424             if(h->deblocking_filter)
2425                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2426
2427             if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2428                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2429                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2430             }
2431
2432             if(IS_INTRA4x4(mb_type)){
2433                 if(simple || !s->encoding){
2434                     if(IS_8x8DCT(mb_type)){
2435                         if(transform_bypass){
2436                             idct_dc_add =
2437                             idct_add    = s->dsp.add_pixels8;
2438                         }else{
2439                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2440                             idct_add    = s->dsp.h264_idct8_add;
2441                         }
2442                         for(i=0; i<16; i+=4){
2443                             uint8_t * const ptr= dest_y + block_offset[i];
2444                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2445                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2446                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2447                             }else{
2448                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2449                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2450                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2451                                 if(nnz){
2452                                     if(nnz == 1 && h->mb[i*16])
2453                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2454                                     else
2455                                         idct_add   (ptr, h->mb + i*16, linesize);
2456                                 }
2457                             }
2458                         }
2459                     }else{
2460                         if(transform_bypass){
2461                             idct_dc_add =
2462                             idct_add    = s->dsp.add_pixels4;
2463                         }else{
2464                             idct_dc_add = s->dsp.h264_idct_dc_add;
2465                             idct_add    = s->dsp.h264_idct_add;
2466                         }
2467                         for(i=0; i<16; i++){
2468                             uint8_t * const ptr= dest_y + block_offset[i];
2469                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2470
2471                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2472                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2473                             }else{
2474                                 uint8_t *topright;
2475                                 int nnz, tr;
2476                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2477                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2478                                     assert(mb_y || linesize <= block_offset[i]);
2479                                     if(!topright_avail){
2480                                         tr= ptr[3 - linesize]*0x01010101;
2481                                         topright= (uint8_t*) &tr;
2482                                     }else
2483                                         topright= ptr + 4 - linesize;
2484                                 }else
2485                                     topright= NULL;
2486
2487                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2488                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2489                                 if(nnz){
2490                                     if(is_h264){
2491                                         if(nnz == 1 && h->mb[i*16])
2492                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2493                                         else
2494                                             idct_add   (ptr, h->mb + i*16, linesize);
2495                                     }else
2496                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2497                                 }
2498                             }
2499                         }
2500                     }
2501                 }
2502             }else{
2503                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2504                 if(is_h264){
2505                     if(!transform_bypass)
2506                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2507                 }else
2508                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2509             }
2510             if(h->deblocking_filter)
2511                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2512         }else if(is_h264){
2513             hl_motion(h, dest_y, dest_cb, dest_cr,
2514                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2515                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2516                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2517         }
2518
2519
2520         if(!IS_INTRA4x4(mb_type)){
2521             if(is_h264){
2522                 if(IS_INTRA16x16(mb_type)){
2523                     if(transform_bypass){
2524                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2525                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2526                         }else{
2527                             for(i=0; i<16; i++){
2528                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2529                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2530                             }
2531                         }
2532                     }else{
2533                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2534                     }
2535                 }else if(h->cbp&15){
2536                     if(transform_bypass){
2537                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2538                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2539                         for(i=0; i<16; i+=di){
2540                             if(h->non_zero_count_cache[ scan8[i] ]){
2541                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2542                             }
2543                         }
2544                     }else{
2545                         if(IS_8x8DCT(mb_type)){
2546                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2547                         }else{
2548                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2549                         }
2550                     }
2551                 }
2552             }else{
2553                 for(i=0; i<16; i++){
2554                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2555                         uint8_t * const ptr= dest_y + block_offset[i];
2556                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2557                     }
2558                 }
2559             }
2560         }
2561
2562         if((simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2563             uint8_t *dest[2] = {dest_cb, dest_cr};
2564             if(transform_bypass){
2565                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2566                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2567                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2568                 }else{
2569                     idct_add = s->dsp.add_pixels4;
2570                     for(i=16; i<16+8; i++){
2571                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2572                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2573                     }
2574                 }
2575             }else{
2576                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2577                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2578                 if(is_h264){
2579                     idct_add = s->dsp.h264_idct_add;
2580                     idct_dc_add = s->dsp.h264_idct_dc_add;
2581                     for(i=16; i<16+8; i++){
2582                         if(h->non_zero_count_cache[ scan8[i] ])
2583                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2584                         else if(h->mb[i*16])
2585                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2586                     }
2587                 }else{
2588                     for(i=16; i<16+8; i++){
2589                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2590                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2591                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2592                         }
2593                     }
2594                 }
2595             }
2596         }
2597     }
2598     if(h->deblocking_filter) {
2599         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2600         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2601         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2602         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2603         if (!simple && FRAME_MBAFF) {
2604             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2605         } else {
2606             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2607         }
2608     }
2609 }
2610
2611 /**
2612  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2613  */
2614 static void hl_decode_mb_simple(H264Context *h){
2615     hl_decode_mb_internal(h, 1);
2616 }
2617
2618 /**
2619  * Process a macroblock; this handles edge cases, such as interlacing.
2620  */
2621 static void av_noinline hl_decode_mb_complex(H264Context *h){
2622     hl_decode_mb_internal(h, 0);
2623 }
2624
2625 static void hl_decode_mb(H264Context *h){
2626     MpegEncContext * const s = &h->s;
2627     const int mb_xy= h->mb_xy;
2628     const int mb_type= s->current_picture.mb_type[mb_xy];
2629     int is_complex = ENABLE_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2630
2631     if(ENABLE_H264_ENCODER && !s->decode)
2632         return;
2633
2634     if (is_complex)
2635         hl_decode_mb_complex(h);
2636     else hl_decode_mb_simple(h);
2637 }
2638
2639 static void pic_as_field(Picture *pic, const int parity){
2640     int i;
2641     for (i = 0; i < 4; ++i) {
2642         if (parity == PICT_BOTTOM_FIELD)
2643             pic->data[i] += pic->linesize[i];
2644         pic->reference = parity;
2645         pic->linesize[i] *= 2;
2646     }
2647     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2648 }
2649
2650 static int split_field_copy(Picture *dest, Picture *src,
2651                             int parity, int id_add){
2652     int match = !!(src->reference & parity);
2653
2654     if (match) {
2655         *dest = *src;
2656         if(parity != PICT_FRAME){
2657             pic_as_field(dest, parity);
2658             dest->pic_id *= 2;
2659             dest->pic_id += id_add;
2660         }
2661     }
2662
2663     return match;
2664 }
2665
2666 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2667     int i[2]={0};
2668     int index=0;
2669
2670     while(i[0]<len || i[1]<len){
2671         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2672             i[0]++;
2673         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2674             i[1]++;
2675         if(i[0] < len){
2676             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2677             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2678         }
2679         if(i[1] < len){
2680             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2681             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2682         }
2683     }
2684
2685     return index;
2686 }
2687
2688 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2689     int i, best_poc;
2690     int out_i= 0;
2691
2692     for(;;){
2693         best_poc= dir ? INT_MIN : INT_MAX;
2694
2695         for(i=0; i<len; i++){
2696             const int poc= src[i]->poc;
2697             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2698                 best_poc= poc;
2699                 sorted[out_i]= src[i];
2700             }
2701         }
2702         if(best_poc == (dir ? INT_MIN : INT_MAX))
2703             break;
2704         limit= sorted[out_i++]->poc - dir;
2705     }
2706     return out_i;
2707 }
2708
2709 /**
2710  * fills the default_ref_list.
2711  */
2712 static int fill_default_ref_list(H264Context *h){
2713     MpegEncContext * const s = &h->s;
2714     int i, len;
2715
2716     if(h->slice_type_nos==FF_B_TYPE){
2717         Picture *sorted[32];
2718         int cur_poc, list;
2719         int lens[2];
2720
2721         if(FIELD_PICTURE)
2722             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2723         else
2724             cur_poc= s->current_picture_ptr->poc;
2725
2726         for(list= 0; list<2; list++){
2727             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2728             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2729             assert(len<=32);
2730             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2731             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2732             assert(len<=32);
2733
2734             if(len < h->ref_count[list])
2735                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2736             lens[list]= len;
2737         }
2738
2739         if(lens[0] == lens[1] && lens[1] > 1){
2740             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2741             if(i == lens[0])
2742                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2743         }
2744     }else{
2745         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2746         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2747         assert(len <= 32);
2748         if(len < h->ref_count[0])
2749             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2750     }
2751 #ifdef TRACE
2752     for (i=0; i<h->ref_count[0]; i++) {
2753         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2754     }
2755     if(h->slice_type_nos==FF_B_TYPE){
2756         for (i=0; i<h->ref_count[1]; i++) {
2757             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2758         }
2759     }
2760 #endif
2761     return 0;
2762 }
2763
2764 static void print_short_term(H264Context *h);
2765 static void print_long_term(H264Context *h);
2766
2767 /**
2768  * Extract structure information about the picture described by pic_num in
2769  * the current decoding context (frame or field). Note that pic_num is
2770  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2771  * @param pic_num picture number for which to extract structure information
2772  * @param structure one of PICT_XXX describing structure of picture
2773  *                      with pic_num
2774  * @return frame number (short term) or long term index of picture
2775  *         described by pic_num
2776  */
2777 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2778     MpegEncContext * const s = &h->s;
2779
2780     *structure = s->picture_structure;
2781     if(FIELD_PICTURE){
2782         if (!(pic_num & 1))
2783             /* opposite field */
2784             *structure ^= PICT_FRAME;
2785         pic_num >>= 1;
2786     }
2787
2788     return pic_num;
2789 }
2790
2791 static int decode_ref_pic_list_reordering(H264Context *h){
2792     MpegEncContext * const s = &h->s;
2793     int list, index, pic_structure;
2794
2795     print_short_term(h);
2796     print_long_term(h);
2797
2798     for(list=0; list<h->list_count; list++){
2799         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2800
2801         if(get_bits1(&s->gb)){
2802             int pred= h->curr_pic_num;
2803
2804             for(index=0; ; index++){
2805                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
2806                 unsigned int pic_id;
2807                 int i;
2808                 Picture *ref = NULL;
2809
2810                 if(reordering_of_pic_nums_idc==3)
2811                     break;
2812
2813                 if(index >= h->ref_count[list]){
2814                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2815                     return -1;
2816                 }
2817
2818                 if(reordering_of_pic_nums_idc<3){
2819                     if(reordering_of_pic_nums_idc<2){
2820                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2821                         int frame_num;
2822
2823                         if(abs_diff_pic_num > h->max_pic_num){
2824                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2825                             return -1;
2826                         }
2827
2828                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2829                         else                                pred+= abs_diff_pic_num;
2830                         pred &= h->max_pic_num - 1;
2831
2832                         frame_num = pic_num_extract(h, pred, &pic_structure);
2833
2834                         for(i= h->short_ref_count-1; i>=0; i--){
2835                             ref = h->short_ref[i];
2836                             assert(ref->reference);
2837                             assert(!ref->long_ref);
2838                             if(
2839                                    ref->frame_num == frame_num &&
2840                                    (ref->reference & pic_structure)
2841                               )
2842                                 break;
2843                         }
2844                         if(i>=0)
2845                             ref->pic_id= pred;
2846                     }else{
2847                         int long_idx;
2848                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2849
2850                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2851
2852                         if(long_idx>31){
2853                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2854                             return -1;
2855                         }
2856                         ref = h->long_ref[long_idx];
2857                         assert(!(ref && !ref->reference));
2858                         if(ref && (ref->reference & pic_structure)){
2859                             ref->pic_id= pic_id;
2860                             assert(ref->long_ref);
2861                             i=0;
2862                         }else{
2863                             i=-1;
2864                         }
2865                     }
2866
2867                     if (i < 0) {
2868                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2869                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2870                     } else {
2871                         for(i=index; i+1<h->ref_count[list]; i++){
2872                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2873                                 break;
2874                         }
2875                         for(; i > index; i--){
2876                             h->ref_list[list][i]= h->ref_list[list][i-1];
2877                         }
2878                         h->ref_list[list][index]= *ref;
2879                         if (FIELD_PICTURE){
2880                             pic_as_field(&h->ref_list[list][index], pic_structure);
2881                         }
2882                     }
2883                 }else{
2884                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2885                     return -1;
2886                 }
2887             }
2888         }
2889     }
2890     for(list=0; list<h->list_count; list++){
2891         for(index= 0; index < h->ref_count[list]; index++){
2892             if(!h->ref_list[list][index].data[0]){
2893                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2894                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2895             }
2896         }
2897     }
2898
2899     return 0;
2900 }
2901
2902 static void fill_mbaff_ref_list(H264Context *h){
2903     int list, i, j;
2904     for(list=0; list<2; list++){ //FIXME try list_count
2905         for(i=0; i<h->ref_count[list]; i++){
2906             Picture *frame = &h->ref_list[list][i];
2907             Picture *field = &h->ref_list[list][16+2*i];
2908             field[0] = *frame;
2909             for(j=0; j<3; j++)
2910                 field[0].linesize[j] <<= 1;
2911             field[0].reference = PICT_TOP_FIELD;
2912             field[0].poc= field[0].field_poc[0];
2913             field[1] = field[0];
2914             for(j=0; j<3; j++)
2915                 field[1].data[j] += frame->linesize[j];
2916             field[1].reference = PICT_BOTTOM_FIELD;
2917             field[1].poc= field[1].field_poc[1];
2918
2919             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2920             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2921             for(j=0; j<2; j++){
2922                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2923                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2924             }
2925         }
2926     }
2927     for(j=0; j<h->ref_count[1]; j++){
2928         for(i=0; i<h->ref_count[0]; i++)
2929             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2930         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2931         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2932     }
2933 }
2934
2935 static int pred_weight_table(H264Context *h){
2936     MpegEncContext * const s = &h->s;
2937     int list, i;
2938     int luma_def, chroma_def;
2939
2940     h->use_weight= 0;
2941     h->use_weight_chroma= 0;
2942     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2943     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2944     luma_def = 1<<h->luma_log2_weight_denom;
2945     chroma_def = 1<<h->chroma_log2_weight_denom;
2946
2947     for(list=0; list<2; list++){
2948         for(i=0; i<h->ref_count[list]; i++){
2949             int luma_weight_flag, chroma_weight_flag;
2950
2951             luma_weight_flag= get_bits1(&s->gb);
2952             if(luma_weight_flag){
2953                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
2954                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
2955                 if(   h->luma_weight[list][i] != luma_def
2956                    || h->luma_offset[list][i] != 0)
2957                     h->use_weight= 1;
2958             }else{
2959                 h->luma_weight[list][i]= luma_def;
2960                 h->luma_offset[list][i]= 0;
2961             }
2962
2963             if(CHROMA){
2964                 chroma_weight_flag= get_bits1(&s->gb);
2965                 if(chroma_weight_flag){
2966                     int j;
2967                     for(j=0; j<2; j++){
2968                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
2969                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
2970                         if(   h->chroma_weight[list][i][j] != chroma_def
2971                         || h->chroma_offset[list][i][j] != 0)
2972                             h->use_weight_chroma= 1;
2973                     }
2974                 }else{
2975                     int j;
2976                     for(j=0; j<2; j++){
2977                         h->chroma_weight[list][i][j]= chroma_def;
2978                         h->chroma_offset[list][i][j]= 0;
2979                     }
2980                 }
2981             }
2982         }
2983         if(h->slice_type_nos != FF_B_TYPE) break;
2984     }
2985     h->use_weight= h->use_weight || h->use_weight_chroma;
2986     return 0;
2987 }
2988
2989 static void implicit_weight_table(H264Context *h){
2990     MpegEncContext * const s = &h->s;
2991     int ref0, ref1;
2992     int cur_poc = s->current_picture_ptr->poc;
2993
2994     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
2995        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
2996         h->use_weight= 0;
2997         h->use_weight_chroma= 0;
2998         return;
2999     }
3000
3001     h->use_weight= 2;
3002     h->use_weight_chroma= 2;
3003     h->luma_log2_weight_denom= 5;
3004     h->chroma_log2_weight_denom= 5;
3005
3006     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3007         int poc0 = h->ref_list[0][ref0].poc;
3008         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3009             int poc1 = h->ref_list[1][ref1].poc;
3010             int td = av_clip(poc1 - poc0, -128, 127);
3011             if(td){
3012                 int tb = av_clip(cur_poc - poc0, -128, 127);
3013                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3014                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3015                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3016                     h->implicit_weight[ref0][ref1] = 32;
3017                 else
3018                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3019             }else
3020                 h->implicit_weight[ref0][ref1] = 32;
3021         }
3022     }
3023 }
3024
3025 /**
3026  * Mark a picture as no longer needed for reference. The refmask
3027  * argument allows unreferencing of individual fields or the whole frame.
3028  * If the picture becomes entirely unreferenced, but is being held for
3029  * display purposes, it is marked as such.
3030  * @param refmask mask of fields to unreference; the mask is bitwise
3031  *                anded with the reference marking of pic
3032  * @return non-zero if pic becomes entirely unreferenced (except possibly
3033  *         for display purposes) zero if one of the fields remains in
3034  *         reference
3035  */
3036 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3037     int i;
3038     if (pic->reference &= refmask) {
3039         return 0;
3040     } else {
3041         for(i = 0; h->delayed_pic[i]; i++)
3042             if(pic == h->delayed_pic[i]){
3043                 pic->reference=DELAYED_PIC_REF;
3044                 break;
3045             }
3046         return 1;
3047     }
3048 }
3049
3050 /**
3051  * instantaneous decoder refresh.
3052  */
3053 static void idr(H264Context *h){
3054     int i;
3055
3056     for(i=0; i<16; i++){
3057         remove_long(h, i, 0);
3058     }
3059     assert(h->long_ref_count==0);
3060
3061     for(i=0; i<h->short_ref_count; i++){
3062         unreference_pic(h, h->short_ref[i], 0);
3063         h->short_ref[i]= NULL;
3064     }
3065     h->short_ref_count=0;
3066     h->prev_frame_num= 0;
3067     h->prev_frame_num_offset= 0;
3068     h->prev_poc_msb=
3069     h->prev_poc_lsb= 0;
3070 }
3071
3072 /* forget old pics after a seek */
3073 static void flush_dpb(AVCodecContext *avctx){
3074     H264Context *h= avctx->priv_data;
3075     int i;
3076     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3077         if(h->delayed_pic[i])
3078             h->delayed_pic[i]->reference= 0;
3079         h->delayed_pic[i]= NULL;
3080     }
3081     h->outputed_poc= INT_MIN;
3082     idr(h);
3083     if(h->s.current_picture_ptr)
3084         h->s.current_picture_ptr->reference= 0;
3085     h->s.first_field= 0;
3086     ff_mpeg_flush(avctx);
3087 }
3088
3089 /**
3090  * Find a Picture in the short term reference list by frame number.
3091  * @param frame_num frame number to search for
3092  * @param idx the index into h->short_ref where returned picture is found
3093  *            undefined if no picture found.
3094  * @return pointer to the found picture, or NULL if no pic with the provided
3095  *                 frame number is found
3096  */
3097 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3098     MpegEncContext * const s = &h->s;
3099     int i;
3100
3101     for(i=0; i<h->short_ref_count; i++){
3102         Picture *pic= h->short_ref[i];
3103         if(s->avctx->debug&FF_DEBUG_MMCO)
3104             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3105         if(pic->frame_num == frame_num) {
3106             *idx = i;
3107             return pic;
3108         }
3109     }
3110     return NULL;
3111 }
3112
3113 /**
3114  * Remove a picture from the short term reference list by its index in
3115  * that list.  This does no checking on the provided index; it is assumed
3116  * to be valid. Other list entries are shifted down.
3117  * @param i index into h->short_ref of picture to remove.
3118  */
3119 static void remove_short_at_index(H264Context *h, int i){
3120     assert(i >= 0 && i < h->short_ref_count);
3121     h->short_ref[i]= NULL;
3122     if (--h->short_ref_count)
3123         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3124 }
3125
3126 /**
3127  *
3128  * @return the removed picture or NULL if an error occurs
3129  */
3130 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3131     MpegEncContext * const s = &h->s;
3132     Picture *pic;
3133     int i;
3134
3135     if(s->avctx->debug&FF_DEBUG_MMCO)
3136         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3137
3138     pic = find_short(h, frame_num, &i);
3139     if (pic){
3140         if(unreference_pic(h, pic, ref_mask))
3141         remove_short_at_index(h, i);
3142     }
3143
3144     return pic;
3145 }
3146
3147 /**
3148  * Remove a picture from the long term reference list by its index in
3149  * that list.
3150  * @return the removed picture or NULL if an error occurs
3151  */
3152 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3153     Picture *pic;
3154
3155     pic= h->long_ref[i];
3156     if (pic){
3157         if(unreference_pic(h, pic, ref_mask)){
3158             assert(h->long_ref[i]->long_ref == 1);
3159             h->long_ref[i]->long_ref= 0;
3160             h->long_ref[i]= NULL;
3161             h->long_ref_count--;
3162         }
3163     }
3164
3165     return pic;
3166 }
3167
3168 /**
3169  * print short term list
3170  */
3171 static void print_short_term(H264Context *h) {
3172     uint32_t i;
3173     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3174         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3175         for(i=0; i<h->short_ref_count; i++){
3176             Picture *pic= h->short_ref[i];
3177             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3178         }
3179     }
3180 }
3181
3182 /**
3183  * print long term list
3184  */
3185 static void print_long_term(H264Context *h) {
3186     uint32_t i;
3187     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3188         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3189         for(i = 0; i < 16; i++){
3190             Picture *pic= h->long_ref[i];
3191             if (pic) {
3192                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3193             }
3194         }
3195     }
3196 }
3197
3198 /**
3199  * Executes the reference picture marking (memory management control operations).
3200  */
3201 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3202     MpegEncContext * const s = &h->s;
3203     int i, j;
3204     int current_ref_assigned=0;
3205     Picture *pic;
3206
3207     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3208         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3209
3210     for(i=0; i<mmco_count; i++){
3211         int structure, frame_num;
3212         if(s->avctx->debug&FF_DEBUG_MMCO)
3213             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3214
3215         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3216            || mmco[i].opcode == MMCO_SHORT2LONG){
3217             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3218             pic = find_short(h, frame_num, &j);
3219             if(!pic){
3220                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3221                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3222                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3223                 continue;
3224             }
3225         }
3226
3227         switch(mmco[i].opcode){
3228         case MMCO_SHORT2UNUSED:
3229             if(s->avctx->debug&FF_DEBUG_MMCO)
3230                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3231             remove_short(h, frame_num, structure ^ PICT_FRAME);
3232             break;
3233         case MMCO_SHORT2LONG:
3234                 if (h->long_ref[mmco[i].long_arg] != pic)
3235                     remove_long(h, mmco[i].long_arg, 0);
3236
3237                 remove_short_at_index(h, j);
3238                 h->long_ref[ mmco[i].long_arg ]= pic;
3239                 if (h->long_ref[ mmco[i].long_arg ]){
3240                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3241                     h->long_ref_count++;
3242                 }
3243             break;
3244         case MMCO_LONG2UNUSED:
3245             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3246             pic = h->long_ref[j];
3247             if (pic) {
3248                 remove_long(h, j, structure ^ PICT_FRAME);
3249             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3250                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3251             break;
3252         case MMCO_LONG:
3253                     // Comment below left from previous code as it is an interresting note.
3254                     /* First field in pair is in short term list or
3255                      * at a different long term index.
3256                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3257                      * Report the problem and keep the pair where it is,
3258                      * and mark this field valid.
3259                      */
3260
3261             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3262                 remove_long(h, mmco[i].long_arg, 0);
3263
3264                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3265                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3266                 h->long_ref_count++;
3267             }
3268
3269             s->current_picture_ptr->reference |= s->picture_structure;
3270             current_ref_assigned=1;
3271             break;
3272         case MMCO_SET_MAX_LONG:
3273             assert(mmco[i].long_arg <= 16);
3274             // just remove the long term which index is greater than new max
3275             for(j = mmco[i].long_arg; j<16; j++){
3276                 remove_long(h, j, 0);
3277             }
3278             break;
3279         case MMCO_RESET:
3280             while(h->short_ref_count){
3281                 remove_short(h, h->short_ref[0]->frame_num, 0);
3282             }
3283             for(j = 0; j < 16; j++) {
3284                 remove_long(h, j, 0);
3285             }
3286             s->current_picture_ptr->poc=
3287             s->current_picture_ptr->field_poc[0]=
3288             s->current_picture_ptr->field_poc[1]=
3289             h->poc_lsb=
3290             h->poc_msb=
3291             h->frame_num=
3292             s->current_picture_ptr->frame_num= 0;
3293             break;
3294         default: assert(0);
3295         }
3296     }
3297
3298     if (!current_ref_assigned) {
3299         /* Second field of complementary field pair; the first field of
3300          * which is already referenced. If short referenced, it
3301          * should be first entry in short_ref. If not, it must exist
3302          * in long_ref; trying to put it on the short list here is an
3303          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3304          */
3305         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3306             /* Just mark the second field valid */
3307             s->current_picture_ptr->reference = PICT_FRAME;
3308         } else if (s->current_picture_ptr->long_ref) {
3309             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3310                                              "assignment for second field "
3311                                              "in complementary field pair "
3312                                              "(first field is long term)\n");
3313         } else {
3314             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3315             if(pic){
3316                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3317             }
3318
3319             if(h->short_ref_count)
3320                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3321
3322             h->short_ref[0]= s->current_picture_ptr;
3323             h->short_ref_count++;
3324             s->current_picture_ptr->reference |= s->picture_structure;
3325         }
3326     }
3327
3328     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3329
3330         /* We have too many reference frames, probably due to corrupted
3331          * stream. Need to discard one frame. Prevents overrun of the
3332          * short_ref and long_ref buffers.
3333          */
3334         av_log(h->s.avctx, AV_LOG_ERROR,
3335                "number of reference frames exceeds max (probably "
3336                "corrupt input), discarding one\n");
3337
3338         if (h->long_ref_count && !h->short_ref_count) {
3339             for (i = 0; i < 16; ++i)
3340                 if (h->long_ref[i])
3341                     break;
3342
3343             assert(i < 16);
3344             remove_long(h, i, 0);
3345         } else {
3346             pic = h->short_ref[h->short_ref_count - 1];
3347             remove_short(h, pic->frame_num, 0);
3348         }
3349     }
3350
3351     print_short_term(h);
3352     print_long_term(h);
3353     return 0;
3354 }
3355
3356 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3357     MpegEncContext * const s = &h->s;
3358     int i;
3359
3360     h->mmco_index= 0;
3361     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3362         s->broken_link= get_bits1(gb) -1;
3363         if(get_bits1(gb)){
3364             h->mmco[0].opcode= MMCO_LONG;
3365             h->mmco[0].long_arg= 0;
3366             h->mmco_index= 1;
3367         }
3368     }else{
3369         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3370             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3371                 MMCOOpcode opcode= get_ue_golomb(gb);
3372
3373                 h->mmco[i].opcode= opcode;
3374                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3375                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3376 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3377                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3378                         return -1;
3379                     }*/
3380                 }
3381                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3382                     unsigned int long_arg= get_ue_golomb(gb);
3383                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3384                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3385                         return -1;
3386                     }
3387                     h->mmco[i].long_arg= long_arg;
3388                 }
3389
3390                 if(opcode > (unsigned)MMCO_LONG){
3391                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3392                     return -1;
3393                 }
3394                 if(opcode == MMCO_END)
3395                     break;
3396             }
3397             h->mmco_index= i;
3398         }else{
3399             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3400
3401             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3402                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3403                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3404                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3405                 h->mmco_index= 1;
3406                 if (FIELD_PICTURE) {
3407                     h->mmco[0].short_pic_num *= 2;
3408                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3409                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3410                     h->mmco_index= 2;
3411                 }
3412             }
3413         }
3414     }
3415
3416     return 0;
3417 }
3418
3419 static int init_poc(H264Context *h){
3420     MpegEncContext * const s = &h->s;
3421     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3422     int field_poc[2];
3423     Picture *cur = s->current_picture_ptr;
3424
3425     h->frame_num_offset= h->prev_frame_num_offset;
3426     if(h->frame_num < h->prev_frame_num)
3427         h->frame_num_offset += max_frame_num;
3428
3429     if(h->sps.poc_type==0){
3430         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3431
3432         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3433             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3434         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3435             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3436         else
3437             h->poc_msb = h->prev_poc_msb;
3438 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3439         field_poc[0] =
3440         field_poc[1] = h->poc_msb + h->poc_lsb;
3441         if(s->picture_structure == PICT_FRAME)
3442             field_poc[1] += h->delta_poc_bottom;
3443     }else if(h->sps.poc_type==1){
3444         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3445         int i;
3446
3447         if(h->sps.poc_cycle_length != 0)
3448             abs_frame_num = h->frame_num_offset + h->frame_num;
3449         else
3450             abs_frame_num = 0;
3451
3452         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3453             abs_frame_num--;
3454
3455         expected_delta_per_poc_cycle = 0;
3456         for(i=0; i < h->sps.poc_cycle_length; i++)
3457             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3458
3459         if(abs_frame_num > 0){
3460             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3461             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3462
3463             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3464             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3465                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3466         } else
3467             expectedpoc = 0;
3468
3469         if(h->nal_ref_idc == 0)
3470             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3471
3472         field_poc[0] = expectedpoc + h->delta_poc[0];
3473         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3474
3475         if(s->picture_structure == PICT_FRAME)
3476             field_poc[1] += h->delta_poc[1];
3477     }else{
3478         int poc= 2*(h->frame_num_offset + h->frame_num);
3479
3480         if(!h->nal_ref_idc)
3481             poc--;
3482
3483         field_poc[0]= poc;
3484         field_poc[1]= poc;
3485     }
3486
3487     if(s->picture_structure != PICT_BOTTOM_FIELD)
3488         s->current_picture_ptr->field_poc[0]= field_poc[0];
3489     if(s->picture_structure != PICT_TOP_FIELD)
3490         s->current_picture_ptr->field_poc[1]= field_poc[1];
3491     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3492
3493     return 0;
3494 }
3495
3496
3497 /**
3498  * initialize scan tables
3499  */
3500 static void init_scan_tables(H264Context *h){
3501     MpegEncContext * const s = &h->s;
3502     int i;
3503     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3504         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3505         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3506     }else{
3507         for(i=0; i<16; i++){
3508 #define T(x) (x>>2) | ((x<<2) & 0xF)
3509             h->zigzag_scan[i] = T(zigzag_scan[i]);
3510             h-> field_scan[i] = T( field_scan[i]);
3511 #undef T
3512         }
3513     }
3514     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3515         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3516         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3517         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3518         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3519     }else{
3520         for(i=0; i<64; i++){
3521 #define T(x) (x>>3) | ((x&7)<<3)
3522             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3523             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3524             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3525             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3526 #undef T
3527         }
3528     }
3529     if(h->sps.transform_bypass){ //FIXME same ugly
3530         h->zigzag_scan_q0          = zigzag_scan;
3531         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3532         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3533         h->field_scan_q0           = field_scan;
3534         h->field_scan8x8_q0        = field_scan8x8;
3535         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3536     }else{
3537         h->zigzag_scan_q0          = h->zigzag_scan;
3538         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3539         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3540         h->field_scan_q0           = h->field_scan;
3541         h->field_scan8x8_q0        = h->field_scan8x8;
3542         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3543     }
3544 }
3545
3546 /**
3547  * Replicates the slice-level state of the H264 "master" context (src)
3548  * into a thread context (dst).
3549  */
3550 static void clone_slice(H264Context *dst, H264Context *src)
3551 {
3552     MpegEncContext * const to   = &dst->s;
3553     MpegEncContext * const from = &src->s;
          /* current picture, line sizes and block offsets */
3554     to->current_picture_ptr = from->current_picture_ptr;
3555     to->current_picture     = from->current_picture;
3556     to->linesize            = from->linesize;
3557     to->uvlinesize          = from->uvlinesize;
3558     to->first_field         = from->first_field;
3559     memcpy(dst->block_offset, src->block_offset, sizeof(dst->block_offset));
          /* POC / frame_num history and short-term reference count */
3560     dst->prev_frame_num        = src->prev_frame_num;
3561     dst->prev_frame_num_offset = src->prev_frame_num_offset;
3562     dst->prev_poc_lsb          = src->prev_poc_lsb;
3563     dst->prev_poc_msb          = src->prev_poc_msb;
3564     dst->short_ref_count       = src->short_ref_count;
          /* reference picture lists */
3565     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3566     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3567     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3568     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
          /* dequantization tables */
3569     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3570     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3571 }
3572
3573 /**
3574  * decodes a slice header.
3575  * This will also call MPV_common_init() and frame_start() as needed.
3576  *
3577  * @param h h264context
3578  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3579  *
3580  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3581  */
3582 static int decode_slice_header(H264Context *h, H264Context *h0){
3583     MpegEncContext * const s = &h->s;
3584     MpegEncContext * const s0 = &h0->s;
3585     unsigned int first_mb_in_slice;
3586     unsigned int pps_id;
3587     int num_ref_idx_active_override_flag;
3588     unsigned int slice_type, tmp, i, j;
3589     int default_ref_list_done = 0;
3590     int last_pic_structure;
3591
3592     s->dropable= h->nal_ref_idc == 0;
3593
3594     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3595         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3596         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3597     }else{
3598         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3599         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3600     }
3601
3602     first_mb_in_slice= get_ue_golomb(&s->gb);
3603
3604     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3605         h0->current_slice = 0;
3606         if (!s0->first_field)
3607             s->current_picture_ptr= NULL;
3608     }
3609
3610     slice_type= get_ue_golomb(&s->gb);
3611     if(slice_type > 9){
3612         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3613         return -1;
3614     }
3615     if(slice_type > 4){
3616         slice_type -= 5;
3617         h->slice_type_fixed=1;
3618     }else
3619         h->slice_type_fixed=0;
3620
3621     slice_type= golomb_to_pict_type[ slice_type ];
3622     if (slice_type == FF_I_TYPE
3623         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3624         default_ref_list_done = 1;
3625     }
3626     h->slice_type= slice_type;
3627     h->slice_type_nos= slice_type & 3;
3628
3629     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3630     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3631         av_log(h->s.avctx, AV_LOG_ERROR,
3632                "B picture before any references, skipping\n");
3633         return -1;
3634     }
3635
3636     pps_id= get_ue_golomb(&s->gb);
3637     if(pps_id>=MAX_PPS_COUNT){
3638         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3639         return -1;
3640     }
3641     if(!h0->pps_buffers[pps_id]) {
3642         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3643         return -1;
3644     }
3645     h->pps= *h0->pps_buffers[pps_id];
3646
3647     if(!h0->sps_buffers[h->pps.sps_id]) {
3648         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3649         return -1;
3650     }
3651     h->sps = *h0->sps_buffers[h->pps.sps_id];
3652
3653     if(h == h0 && h->dequant_coeff_pps != pps_id){
3654         h->dequant_coeff_pps = pps_id;
3655         init_dequant_tables(h);
3656     }
3657
3658     s->mb_width= h->sps.mb_width;
3659     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3660
3661     h->b_stride=  s->mb_width*4;
3662     h->b8_stride= s->mb_width*2;
3663
3664     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3665     if(h->sps.frame_mbs_only_flag)
3666         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3667     else
3668         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3669
3670     if (s->context_initialized
3671         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3672         if(h != h0)
3673             return -1;   // width / height changed during parallelized decoding
3674         free_tables(h);
3675         flush_dpb(s->avctx);
3676         MPV_common_end(s);
3677     }
3678     if (!s->context_initialized) {
3679         if(h != h0)
3680             return -1;  // we cant (re-)initialize context during parallel decoding
3681         if (MPV_common_init(s) < 0)
3682             return -1;
3683         s->first_field = 0;
3684
3685         init_scan_tables(h);
3686         alloc_tables(h);
3687
3688         for(i = 1; i < s->avctx->thread_count; i++) {
3689             H264Context *c;
3690             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3691             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3692             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3693             c->sps = h->sps;
3694             c->pps = h->pps;
3695             init_scan_tables(c);
3696             clone_tables(c, h);
3697         }
3698
3699         for(i = 0; i < s->avctx->thread_count; i++)
3700             if(context_init(h->thread_context[i]) < 0)
3701                 return -1;
3702
3703         s->avctx->width = s->width;
3704         s->avctx->height = s->height;
3705         s->avctx->sample_aspect_ratio= h->sps.sar;
3706         if(!s->avctx->sample_aspect_ratio.den)
3707             s->avctx->sample_aspect_ratio.den = 1;
3708
3709         if(h->sps.timing_info_present_flag){
3710             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3711             if(h->x264_build > 0 && h->x264_build < 44)
3712                 s->avctx->time_base.den *= 2;
3713             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3714                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3715         }
3716     }
3717
3718     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3719
3720     h->mb_mbaff = 0;
3721     h->mb_aff_frame = 0;
3722     last_pic_structure = s0->picture_structure;
3723     if(h->sps.frame_mbs_only_flag){
3724         s->picture_structure= PICT_FRAME;
3725     }else{
3726         if(get_bits1(&s->gb)) { //field_pic_flag
3727             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3728         } else {
3729             s->picture_structure= PICT_FRAME;
3730             h->mb_aff_frame = h->sps.mb_aff;
3731         }
3732     }
3733     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3734
3735     if(h0->current_slice == 0){
3736         while(h->frame_num !=  h->prev_frame_num &&
3737               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3738             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3739             frame_start(h);
3740             h->prev_frame_num++;
3741             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3742             s->current_picture_ptr->frame_num= h->prev_frame_num;
3743             execute_ref_pic_marking(h, NULL, 0);
3744         }
3745
3746         /* See if we have a decoded first field looking for a pair... */
3747         if (s0->first_field) {
3748             assert(s0->current_picture_ptr);
3749             assert(s0->current_picture_ptr->data[0]);
3750             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3751
3752             /* figure out if we have a complementary field pair */
3753             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3754                 /*
3755                  * Previous field is unmatched. Don't display it, but let it
3756                  * remain for reference if marked as such.
3757                  */
3758                 s0->current_picture_ptr = NULL;
3759                 s0->first_field = FIELD_PICTURE;
3760
3761             } else {
3762                 if (h->nal_ref_idc &&
3763                         s0->current_picture_ptr->reference &&
3764                         s0->current_picture_ptr->frame_num != h->frame_num) {
3765                     /*
3766                      * This and previous field were reference, but had
3767                      * different frame_nums. Consider this field first in
3768                      * pair. Throw away previous field except for reference
3769                      * purposes.
3770                      */
3771                     s0->first_field = 1;
3772                     s0->current_picture_ptr = NULL;
3773
3774                 } else {
3775                     /* Second field in complementary pair */
3776                     s0->first_field = 0;
3777                 }
3778             }
3779
3780         } else {
3781             /* Frame or first field in a potentially complementary pair */
3782             assert(!s0->current_picture_ptr);
3783             s0->first_field = FIELD_PICTURE;
3784         }
3785
3786         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3787             s0->first_field = 0;
3788             return -1;
3789         }
3790     }
3791     if(h != h0)
3792         clone_slice(h, h0);
3793
3794     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3795
3796     assert(s->mb_num == s->mb_width * s->mb_height);
3797     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3798        first_mb_in_slice                    >= s->mb_num){
3799         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3800         return -1;
3801     }
3802     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3803     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3804     if (s->picture_structure == PICT_BOTTOM_FIELD)
3805         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3806     assert(s->mb_y < s->mb_height);
3807
3808     if(s->picture_structure==PICT_FRAME){
3809         h->curr_pic_num=   h->frame_num;
3810         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3811     }else{
3812         h->curr_pic_num= 2*h->frame_num + 1;
3813         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3814     }
3815
3816     if(h->nal_unit_type == NAL_IDR_SLICE){
3817         get_ue_golomb(&s->gb); /* idr_pic_id */
3818     }
3819
3820     if(h->sps.poc_type==0){
3821         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3822
3823         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3824             h->delta_poc_bottom= get_se_golomb(&s->gb);
3825         }
3826     }
3827
3828     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3829         h->delta_poc[0]= get_se_golomb(&s->gb);
3830
3831         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3832             h->delta_poc[1]= get_se_golomb(&s->gb);
3833     }
3834
3835     init_poc(h);
3836
3837     if(h->pps.redundant_pic_cnt_present){
3838         h->redundant_pic_count= get_ue_golomb(&s->gb);
3839     }
3840
3841     //set defaults, might be overridden a few lines later
3842     h->ref_count[0]= h->pps.ref_count[0];
3843     h->ref_count[1]= h->pps.ref_count[1];
3844
3845     if(h->slice_type_nos != FF_I_TYPE){
3846         if(h->slice_type_nos == FF_B_TYPE){
3847             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3848         }
3849         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3850
3851         if(num_ref_idx_active_override_flag){
3852             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3853             if(h->slice_type_nos==FF_B_TYPE)
3854                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3855
3856             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3857                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3858                 h->ref_count[0]= h->ref_count[1]= 1;
3859                 return -1;
3860             }
3861         }
3862         if(h->slice_type_nos == FF_B_TYPE)
3863             h->list_count= 2;
3864         else
3865             h->list_count= 1;
3866     }else
3867         h->list_count= 0;
3868
3869     if(!default_ref_list_done){
3870         fill_default_ref_list(h);
3871     }
3872
3873     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3874         return -1;
3875
3876     if(h->slice_type_nos!=FF_I_TYPE){
3877         s->last_picture_ptr= &h->ref_list[0][0];
3878         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3879     }
3880     if(h->slice_type_nos==FF_B_TYPE){
3881         s->next_picture_ptr= &h->ref_list[1][0];
3882         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3883     }
3884
3885     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3886        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3887         pred_weight_table(h);
3888     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3889         implicit_weight_table(h);
3890     else
3891         h->use_weight = 0;
3892
3893     if(h->nal_ref_idc)
3894         decode_ref_pic_marking(h0, &s->gb);
3895
3896     if(FRAME_MBAFF)
3897         fill_mbaff_ref_list(h);
3898
3899     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3900         direct_dist_scale_factor(h);
3901     direct_ref_list_init(h);
3902
3903     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3904         tmp = get_ue_golomb(&s->gb);
3905         if(tmp > 2){
3906             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3907             return -1;
3908         }
3909         h->cabac_init_idc= tmp;
3910     }
3911
3912     h->last_qscale_diff = 0;
3913     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3914     if(tmp>51){
3915         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3916         return -1;
3917     }
3918     s->qscale= tmp;
3919     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3920     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3921     //FIXME qscale / qp ... stuff
3922     if(h->slice_type == FF_SP_TYPE){
3923         get_bits1(&s->gb); /* sp_for_switch_flag */
3924     }
3925     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3926         get_se_golomb(&s->gb); /* slice_qs_delta */
3927     }
3928
3929     h->deblocking_filter = 1;
3930     h->slice_alpha_c0_offset = 0;
3931     h->slice_beta_offset = 0;
3932     if( h->pps.deblocking_filter_parameters_present ) {
3933         tmp= get_ue_golomb(&s->gb);
3934         if(tmp > 2){
3935             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3936             return -1;
3937         }
3938         h->deblocking_filter= tmp;
3939         if(h->deblocking_filter < 2)
3940             h->deblocking_filter^= 1; // 1<->0
3941
3942         if( h->deblocking_filter ) {
3943             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3944             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3945         }
3946     }
3947
3948     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
3949        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
3950        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
3951        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
3952         h->deblocking_filter= 0;
3953
3954     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
3955         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
3956             /* Cheat slightly for speed:
3957                Do not bother to deblock across slices. */
3958             h->deblocking_filter = 2;
3959         } else {
3960             h0->max_contexts = 1;
3961             if(!h0->single_decode_warning) {
3962                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
3963                 h0->single_decode_warning = 1;
3964             }
3965             if(h != h0)
3966                 return 1; // deblocking switched inside frame
3967         }
3968     }
3969
3970 #if 0 //FMO
3971     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3972         slice_group_change_cycle= get_bits(&s->gb, ?);
3973 #endif
3974
3975     h0->last_slice_type = slice_type;
3976     h->slice_num = ++h0->current_slice;
3977     if(h->slice_num >= MAX_SLICES){
3978         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
3979     }
3980
3981     for(j=0; j<2; j++){
3982         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
3983         ref2frm[0]=
3984         ref2frm[1]= -1;
3985         for(i=0; i<16; i++)
3986             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
3987                           +(h->ref_list[j][i].reference&3);
3988         ref2frm[18+0]=
3989         ref2frm[18+1]= -1;
3990         for(i=16; i<48; i++)
3991             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
3992                           +(h->ref_list[j][i].reference&3);
3993     }
3994
3995     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
3996     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
3997
3998     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3999         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4000                h->slice_num,
4001                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4002                first_mb_in_slice,
4003                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4004                pps_id, h->frame_num,
4005                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4006                h->ref_count[0], h->ref_count[1],
4007                s->qscale,
4008                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4009                h->use_weight,
4010                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4011                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4012                );
4013     }
4014
4015     return 0;
4016 }
4017
4018 /**
4019  * Reads a level prefix: the bits up to and including the first set bit of
4020  * the reader cache are consumed (assuming av_log2() yields the index of the
4021  * highest set bit). @return the number of bits skipped, minus one
4022  */
4023 static inline int get_level_prefix(GetBitContext *gb){
4024     unsigned int cache;
4025     int len;
4026
4027     OPEN_READER(re, gb);
4028     UPDATE_CACHE(re, gb);
4029     cache= GET_CACHE(re, gb);
4030     len= 32 - av_log2(cache);
4031 #ifdef TRACE
4032     print_bin(cache>>(32-len), len);
4033     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", cache>>(32-len), len, len-1, get_bits_count(gb), __FILE__);
4034 #endif
4035     LAST_SKIP_BITS(re, gb, len);
4036     CLOSE_READER(re, gb);
4037
4038     return len-1;
4039 }
4040
     /**
      * Tests the four sub-macroblock types of the current macroblock
      * (presumably to gate the 8x8 transform, going by the name).
      * @return 0 if any entry fails IS_SUB_8X8(), or is IS_DIRECT() while the
      *         SPS has direct_8x8_inference_flag unset; 1 otherwise
      */
4041 static inline int get_dct8x8_allowed(H264Context *h){
4042     int idx;
4043     for(idx=3; idx>=0; idx--){
4044         const int sub = h->sub_mb_type[idx];
4045         if(!IS_SUB_8X8(sub)) return 0;
4046         if(IS_DIRECT(sub) && !h->sps.direct_8x8_inference_flag) return 0;
4047     }
4048     return 1;
4049 }
4050
4051 /**
4052  * decodes a residual block.
      * Reads the coefficient token, the levels, the total number of zeros and
      * the run lengths from gb and writes the coefficients into block at the
      * positions given by scantable.
      * @param h h264 context
      * @param gb bit reader the block is parsed from
      * @param block destination coefficient array
4053  * @param n block index
4054  * @param scantable scantable
      * @param qmul dequantization multipliers indexed by coefficient position;
      *             not read when n > 24
4055  * @param max_coeff number of coefficients in the block
4056  * @return <0 if an error occurred
4057  */
4058 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4059     MpegEncContext * const s = &h->s;
          /* maps a predicted non-zero count (0..16) to one of the 4 coeff_token VLC tables */
4060     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4061     int level[16];
4062     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4063
4064     //FIXME put trailing_ones into the context
4065
          /* coeff_token carries total_coeff in its upper bits and trailing_ones in
             its low 2 bits; the VLC table depends on the block kind and, outside
             chroma DC, on the predicted non-zero count */
4066     if(n == CHROMA_DC_BLOCK_INDEX){
4067         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4068         total_coeff= coeff_token>>2;
4069     }else{
4070         if(n == LUMA_DC_BLOCK_INDEX){
4071             total_coeff= pred_non_zero_count(h, 0);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074         }else{
4075             total_coeff= pred_non_zero_count(h, n);
4076             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4077             total_coeff= coeff_token>>2;
                  /* only this branch records the count in the non-zero count cache */
4078             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4079         }
4080     }
4081
4082     //FIXME set last_non_zero?
4083
4084     if(total_coeff==0)
4085         return 0;
          /* the cast makes this an unsigned comparison */
4086     if(total_coeff > (unsigned)max_coeff) {
4087         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4088         return -1;
4089     }
4090
4091     trailing_ones= coeff_token&3;
4092     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4093     assert(total_coeff<=16);
4094
          /* sign bits of the trailing ones: three bits are peeked and all three
             levels are filled in, but only trailing_ones bits are consumed;
             a set bit gives -1, a clear bit +1 */
4095     i = show_bits(gb, 3);
4096     skip_bits(gb, trailing_ones);
4097     level[0] = 1-((i&4)>>1);
4098     level[1] = 1-((i&2)   );
4099     level[2] = 1-((i&1)<<1);
4100
4101     if(trailing_ones<total_coeff) {
4102         int level_code, mask;
4103         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4104         int prefix= get_level_prefix(gb);
4105
              /* NOTE(review): prefix is not range-checked before it is used as a
                 bit count in get_bits(gb, prefix-3) - confirm the reader's limit */
4106         //first coefficient has suffix_length equal to 0 or 1
4107         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4108             if(suffix_length)
4109                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4110             else
4111                 level_code= (prefix<<suffix_length); //part
4112         }else if(prefix==14){
4113             if(suffix_length)
4114                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4115             else
4116                 level_code= prefix + get_bits(gb, 4); //part
4117         }else{
4118             level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4119             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4120             if(prefix>=16)
4121                 level_code += (1<<(prefix-3))-4096;
4122         }
4123
4124         if(trailing_ones < 3) level_code += 2;
4125
4126         suffix_length = 1;
4127         if(level_code > 5)
4128             suffix_length++;
              /* mask is -1 for an odd level_code and 0 for an even one, so the
                 xor/subtract pair negates the magnitude for odd codes */
4129         mask= -(level_code&1);
4130         level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4131
4132         //remaining coefficients have suffix_length > 0
4133         for(i=trailing_ones+1;i<total_coeff;i++) {
                  /* suffix_length grows by one once level_code exceeds the limit
                     for the current length */
4134             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4135             prefix = get_level_prefix(gb);
4136             if(prefix<15){
4137                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4138             }else{
4139                 level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4140                 if(prefix>=16)
4141                     level_code += (1<<(prefix-3))-4096;
4142             }
4143             mask= -(level_code&1);
4144             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4145             if(level_code > suffix_limit[suffix_length])
4146                 suffix_length++;
4147         }
4148     }
4149
          /* a completely filled block has no zeros; otherwise the total is read
             from the VLC table selected by total_coeff */
4150     if(total_coeff == max_coeff)
4151         zeros_left=0;
4152     else{
4153         if(n == CHROMA_DC_BLOCK_INDEX)
4154             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4155         else
4156             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4157     }
4158
          /* coefficients are placed from the highest scan position downwards;
             each run_before moves the position further down */
4159     coeff_num = zeros_left + total_coeff - 1;
4160     j = scantable[coeff_num];
          /* blocks with n > 24 are stored as parsed (presumably the DC blocks -
             TODO confirm against the *_BLOCK_INDEX values); all others are scaled
             by qmul with rounding */
4161     if(n > 24){
4162         block[j] = level[0];
4163         for(i=1;i<total_coeff;i++) {
4164             if(zeros_left <= 0)
4165                 run_before = 0;
4166             else if(zeros_left < 7){
4167                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4168             }else{
4169                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4170             }
4171             zeros_left -= run_before;
4172             coeff_num -= 1 + run_before;
4173             j= scantable[ coeff_num ];
4174
4175             block[j]= level[i];
4176         }
4177     }else{
4178         block[j] = (level[0] * qmul[j] + 32)>>6;
4179         for(i=1;i<total_coeff;i++) {
4180             if(zeros_left <= 0)
4181                 run_before = 0;
4182             else if(zeros_left < 7){
4183                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4184             }else{
4185                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4186             }
4187             zeros_left -= run_before;
4188             coeff_num -= 1 + run_before;
4189             j= scantable[ coeff_num ];
4190
4191             block[j]= (level[i] * qmul[j] + 32)>>6;
4192         }
4193     }
4194
          /* NOTE(review): this check runs after the loops above have already
             indexed scantable and block with the decremented coeff_num */
4195     if(zeros_left<0){
4196         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4197         return -1;
4198     }
4199
4200     return 0;
4201 }
4202
     /**
      * Predicts the field decoding flag of the current macroblock from a
      * neighbour in the same slice: the entry at mb_xy-1 is tried first, then
      * the one at mb_xy-mb_stride; without either, the flag is cleared.
      * Writes both h->mb_mbaff and h->mb_field_decoding_flag.
      */
4203 static void predict_field_decoding_flag(H264Context *h){
4204     MpegEncContext * const s = &h->s;
4205     const int left = h->mb_xy - 1, top = h->mb_xy - s->mb_stride;
4206     int neighbour_type = 0;
4207     if(h->slice_table[left] == h->slice_num)
4208         neighbour_type = s->current_picture.mb_type[left];
4209     else if(h->slice_table[top] == h->slice_num)
4210         neighbour_type = s->current_picture.mb_type[top];
4211     h->mb_mbaff = h->mb_field_decoding_flag = !!IS_INTERLACED(neighbour_type);
4212 }
4213
4214 /**
4215  * decodes a P_SKIP or B_SKIP macroblock:
4216  * clears its non-zero counts, fills the motion caches through
4217  * pred_direct_motion() (B slices) or pred_pskip_motion() (otherwise),
4218  * writes the motion back and records type, qscale and slice number.
4219  */
4220 static void decode_mb_skip(H264Context *h){
4221     MpegEncContext * const s = &h->s;
4222     const int xy = h->mb_xy;
4223     int type = MB_FIELD ? MB_TYPE_INTERLACED : 0;
4224
4225     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4226     memset(h->non_zero_count[xy], 0, 16);
4227
4228     if(h->slice_type_nos != FF_B_TYPE){
4229         int mvx, mvy;
4230
4231         type |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4232         fill_caches(h, type, 0); //FIXME check what is needed and what not ...
4233         pred_pskip_motion(h, &mvx, &mvy);
4234         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4235         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mvx,mvy), 4);
4236     }else{
4237         // just for fill_caches. pred_direct_motion will set the real mb_type
4238         type |= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4239         fill_caches(h, type, 0); //FIXME check what is needed and what not ...
4240         pred_direct_motion(h, &type);
4241         type |= MB_TYPE_SKIP;
4242     }
4243
4244     write_back_motion(h, type);
4245     h->slice_table[ xy ] = h->slice_num;
4246     s->current_picture.qscale_table[xy] = s->qscale;
4247     s->current_picture.mb_type[xy] = type;
4248     h->prev_mb_skipped = 1;
4249 }
4254
4255 /**
4256  * decodes a macroblock
4257  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4258  */
4259 static int decode_mb_cavlc(H264Context *h){
4260     MpegEncContext * const s = &h->s;
4261     int mb_xy;
4262     int partition_count;
4263     unsigned int mb_type, cbp;
4264     int dct8x8_allowed= h->pps.transform_8x8_mode;
4265
4266     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4267
4268     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4269
4270     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4271     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4272                 down the code */
4273     if(h->slice_type_nos != FF_I_TYPE){
4274         if(s->mb_skip_run==-1)
4275             s->mb_skip_run= get_ue_golomb(&s->gb);
4276
4277         if (s->mb_skip_run--) {
4278             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4279                 if(s->mb_skip_run==0)
4280                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4281                 else
4282                     predict_field_decoding_flag(h);
4283             }
4284             decode_mb_skip(h);
4285             return 0;
4286         }
4287     }
4288     if(FRAME_MBAFF){
4289         if( (s->mb_y&1) == 0 )
4290             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4291     }
4292
4293     h->prev_mb_skipped= 0;
4294
4295     mb_type= get_ue_golomb(&s->gb);
4296     if(h->slice_type_nos == FF_B_TYPE){
4297         if(mb_type < 23){
4298             partition_count= b_mb_type_info[mb_type].partition_count;
4299             mb_type=         b_mb_type_info[mb_type].type;
4300         }else{
4301             mb_type -= 23;
4302             goto decode_intra_mb;
4303         }
4304     }else if(h->slice_type_nos == FF_P_TYPE){
4305         if(mb_type < 5){
4306             partition_count= p_mb_type_info[mb_type].partition_count;
4307             mb_type=         p_mb_type_info[mb_type].type;
4308         }else{
4309             mb_type -= 5;
4310             goto decode_intra_mb;
4311         }
4312     }else{
4313        assert(h->slice_type_nos == FF_I_TYPE);
4314         if(h->slice_type == FF_SI_TYPE && mb_type)
4315             mb_type--;
4316 decode_intra_mb:
4317         if(mb_type > 25){
4318             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4319             return -1;
4320         }
4321         partition_count=0;
4322         cbp= i_mb_type_info[mb_type].cbp;
4323         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4324         mb_type= i_mb_type_info[mb_type].type;
4325     }
4326
4327     if(MB_FIELD)
4328         mb_type |= MB_TYPE_INTERLACED;
4329
4330     h->slice_table[ mb_xy ]= h->slice_num;
4331
4332     if(IS_INTRA_PCM(mb_type)){
4333         unsigned int x;
4334
4335         // We assume these blocks are very rare so we do not optimize it.
4336         align_get_bits(&s->gb);
4337
4338         // The pixels are stored in the same order as levels in h->mb array.
4339         for(x=0; x < (CHROMA ? 384 : 256); x++){
4340             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4341         }
4342
4343         // In deblocking, the quantizer is 0
4344         s->current_picture.qscale_table[mb_xy]= 0;
4345         // All coeffs are present
4346         memset(h->non_zero_count[mb_xy], 16, 16);
4347
4348         s->current_picture.mb_type[mb_xy]= mb_type;
4349         return 0;
4350     }
4351
4352     if(MB_MBAFF){
4353         h->ref_count[0] <<= 1;
4354         h->ref_count[1] <<= 1;
4355     }
4356
4357     fill_caches(h, mb_type, 0);
4358
4359     //mb_pred
4360     if(IS_INTRA(mb_type)){
4361         int pred_mode;
4362 //            init_top_left_availability(h);
4363         if(IS_INTRA4x4(mb_type)){
4364             int i;
4365             int di = 1;
4366             if(dct8x8_allowed && get_bits1(&s->gb)){
4367                 mb_type |= MB_TYPE_8x8DCT;
4368                 di = 4;
4369             }
4370
4371 //                fill_intra4x4_pred_table(h);
4372             for(i=0; i<16; i+=di){
4373                 int mode= pred_intra_mode(h, i);
4374
4375                 if(!get_bits1(&s->gb)){
4376                     const int rem_mode= get_bits(&s->gb, 3);
4377                     mode = rem_mode + (rem_mode >= mode);
4378                 }
4379
4380                 if(di==4)
4381                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4382                 else
4383                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4384             }
4385             write_back_intra_pred_mode(h);
4386             if( check_intra4x4_pred_mode(h) < 0)
4387                 return -1;
4388         }else{
4389             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4390             if(h->intra16x16_pred_mode < 0)
4391                 return -1;
4392         }
4393         if(CHROMA){
4394             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4395             if(pred_mode < 0)
4396                 return -1;
4397             h->chroma_pred_mode= pred_mode;
4398         }
4399     }else if(partition_count==4){
4400         int i, j, sub_partition_count[4], list, ref[2][4];
4401
4402         if(h->slice_type_nos == FF_B_TYPE){
4403             for(i=0; i<4; i++){
4404                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4405                 if(h->sub_mb_type[i] >=13){
4406                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4407                     return -1;
4408                 }
4409                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4410                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4411             }
4412             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4413                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4414                 pred_direct_motion(h, &mb_type);
4415                 h->ref_cache[0][scan8[4]] =
4416                 h->ref_cache[1][scan8[4]] =
4417                 h->ref_cache[0][scan8[12]] =
4418                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4419             }
4420         }else{
4421             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4422             for(i=0; i<4; i++){
4423                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4424                 if(h->sub_mb_type[i] >=4){
4425                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4426                     return -1;
4427                 }
4428                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4429                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4430             }
4431         }
4432
4433         for(list=0; list<h->list_count; list++){
4434             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4435             for(i=0; i<4; i++){
4436                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4437                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4438                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4439                     if(tmp>=ref_count){
4440                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4441                         return -1;
4442                     }
4443                     ref[list][i]= tmp;
4444                 }else{
4445                  //FIXME
4446                     ref[list][i] = -1;
4447                 }
4448             }
4449         }
4450
4451         if(dct8x8_allowed)
4452             dct8x8_allowed = get_dct8x8_allowed(h);
4453
4454         for(list=0; list<h->list_count; list++){
4455             for(i=0; i<4; i++){
4456                 if(IS_DIRECT(h->sub_mb_type[i])) {
4457                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4458                     continue;
4459                 }
4460                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4461                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4462
4463                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4464                     const int sub_mb_type= h->sub_mb_type[i];
4465                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4466                     for(j=0; j<sub_partition_count[i]; j++){
4467                         int mx, my;
4468                         const int index= 4*i + block_width*j;
4469                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4470                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4471                         mx += get_se_golomb(&s->gb);
4472                         my += get_se_golomb(&s->gb);
4473                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4474
4475                         if(IS_SUB_8X8(sub_mb_type)){
4476                             mv_cache[ 1 ][0]=
4477                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4478                             mv_cache[ 1 ][1]=
4479                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4480                         }else if(IS_SUB_8X4(sub_mb_type)){
4481                             mv_cache[ 1 ][0]= mx;
4482                             mv_cache[ 1 ][1]= my;
4483                         }else if(IS_SUB_4X8(sub_mb_type)){
4484                             mv_cache[ 8 ][0]= mx;
4485                             mv_cache[ 8 ][1]= my;
4486                         }
4487                         mv_cache[ 0 ][0]= mx;
4488                         mv_cache[ 0 ][1]= my;
4489                     }
4490                 }else{
4491                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4492                     p[0] = p[1]=
4493                     p[8] = p[9]= 0;
4494                 }
4495             }
4496         }
4497     }else if(IS_DIRECT(mb_type)){
4498         pred_direct_motion(h, &mb_type);
4499         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4500     }else{
4501         int list, mx, my, i;
4502          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4503         if(IS_16X16(mb_type)){
4504             for(list=0; list<h->list_count; list++){
4505                     unsigned int val;
4506                     if(IS_DIR(mb_type, 0, list)){
4507                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4508                         if(val >= h->ref_count[list]){
4509                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4510                             return -1;
4511                         }
4512                     }else
4513                         val= LIST_NOT_USED&0xFF;
4514                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4515             }
4516             for(list=0; list<h->list_count; list++){
4517                 unsigned int val;
4518                 if(IS_DIR(mb_type, 0, list)){
4519                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4520                     mx += get_se_golomb(&s->gb);
4521                     my += get_se_golomb(&s->gb);
4522                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4523
4524                     val= pack16to32(mx,my);
4525                 }else
4526                     val=0;
4527                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4528             }
4529         }
4530         else if(IS_16X8(mb_type)){
4531             for(list=0; list<h->list_count; list++){
4532                     for(i=0; i<2; i++){
4533                         unsigned int val;
4534                         if(IS_DIR(mb_type, i, list)){
4535                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4536                             if(val >= h->ref_count[list]){
4537                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4538                                 return -1;
4539                             }
4540                         }else
4541                             val= LIST_NOT_USED&0xFF;
4542                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4543                     }
4544             }
4545             for(list=0; list<h->list_count; list++){
4546                 for(i=0; i<2; i++){
4547                     unsigned int val;
4548                     if(IS_DIR(mb_type, i, list)){
4549                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4550                         mx += get_se_golomb(&s->gb);
4551                         my += get_se_golomb(&s->gb);
4552                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4553
4554                         val= pack16to32(mx,my);
4555                     }else
4556                         val=0;
4557                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4558                 }
4559             }
4560         }else{
4561             assert(IS_8X16(mb_type));
4562             for(list=0; list<h->list_count; list++){
4563                     for(i=0; i<2; i++){
4564                         unsigned int val;
4565                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4566                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4567                             if(val >= h->ref_count[list]){
4568                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4569                                 return -1;
4570                             }
4571                         }else
4572                             val= LIST_NOT_USED&0xFF;
4573                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4574                     }
4575             }
4576             for(list=0; list<h->list_count; list++){
4577                 for(i=0; i<2; i++){
4578                     unsigned int val;
4579                     if(IS_DIR(mb_type, i, list)){
4580                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4581                         mx += get_se_golomb(&s->gb);
4582                         my += get_se_golomb(&s->gb);
4583                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4584
4585                         val= pack16to32(mx,my);
4586                     }else
4587                         val=0;
4588                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4589                 }
4590             }
4591         }
4592     }
4593
4594     if(IS_INTER(mb_type))
4595         write_back_motion(h, mb_type);
4596
4597     if(!IS_INTRA16x16(mb_type)){
4598         cbp= get_ue_golomb(&s->gb);
4599         if(cbp > 47){
4600             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4601             return -1;
4602         }
4603
4604         if(CHROMA){
4605             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4606             else                     cbp= golomb_to_inter_cbp   [cbp];
4607         }else{
4608             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4609             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4610         }
4611     }
4612     h->cbp = cbp;
4613
4614     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4615         if(get_bits1(&s->gb)){
4616             mb_type |= MB_TYPE_8x8DCT;
4617             h->cbp_table[mb_xy]= cbp;
4618         }
4619     }
4620     s->current_picture.mb_type[mb_xy]= mb_type;
4621
4622     if(cbp || IS_INTRA16x16(mb_type)){
4623         int i8x8, i4x4, chroma_idx;
4624         int dquant;
4625         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4626         const uint8_t *scan, *scan8x8, *dc_scan;
4627
4628 //        fill_non_zero_count_cache(h);
4629
4630         if(IS_INTERLACED(mb_type)){
4631             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4632             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4633             dc_scan= luma_dc_field_scan;
4634         }else{
4635             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4636             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4637             dc_scan= luma_dc_zigzag_scan;
4638         }
4639
4640         dquant= get_se_golomb(&s->gb);
4641
4642         if( dquant > 25 || dquant < -26 ){
4643             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4644             return -1;
4645         }
4646
4647         s->qscale += dquant;
4648         if(((unsigned)s->qscale) > 51){
4649             if(s->qscale<0) s->qscale+= 52;
4650             else            s->qscale-= 52;
4651         }
4652
4653         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4654         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4655         if(IS_INTRA16x16(mb_type)){
4656             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4657                 return -1; //FIXME continue if partitioned and other return -1 too
4658             }
4659
4660             assert((cbp&15) == 0 || (cbp&15) == 15);
4661
4662             if(cbp&15){
4663                 for(i8x8=0; i8x8<4; i8x8++){
4664                     for(i4x4=0; i4x4<4; i4x4++){
4665                         const int index= i4x4 + 4*i8x8;
4666                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4667                             return -1;
4668                         }
4669                     }
4670                 }
4671             }else{
4672                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4673             }
4674         }else{
4675             for(i8x8=0; i8x8<4; i8x8++){
4676                 if(cbp & (1<<i8x8)){
4677                     if(IS_8x8DCT(mb_type)){
4678                         DCTELEM *buf = &h->mb[64*i8x8];
4679                         uint8_t *nnz;
4680                         for(i4x4=0; i4x4<4; i4x4++){
4681                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4682                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4683                                 return -1;
4684                         }
4685                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4686                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4687                     }else{
4688                         for(i4x4=0; i4x4<4; i4x4++){
4689                             const int index= i4x4 + 4*i8x8;
4690
4691                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4692                                 return -1;
4693                             }
4694                         }
4695                     }
4696                 }else{
4697                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4698                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4699                 }
4700             }
4701         }
4702
4703         if(cbp&0x30){
4704             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4705                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4706                     return -1;
4707                 }
4708         }
4709
4710         if(cbp&0x20){
4711             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4712                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4713                 for(i4x4=0; i4x4<4; i4x4++){
4714                     const int index= 16 + 4*chroma_idx + i4x4;
4715                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4716                         return -1;
4717                     }
4718                 }
4719             }
4720         }else{
4721             uint8_t * const nnz= &h->non_zero_count_cache[0];
4722             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4723             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4724         }
4725     }else{
4726         uint8_t * const nnz= &h->non_zero_count_cache[0];
4727         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4728         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4729         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4730     }
4731     s->current_picture.qscale_table[mb_xy]= s->qscale;
4732     write_back_non_zero_count(h);
4733
4734     if(MB_MBAFF){
4735         h->ref_count[0] >>= 1;
4736         h->ref_count[1] >>= 1;
4737     }
4738
4739     return 0;
4740 }
4741
4742 static int decode_cabac_field_decoding_flag(H264Context *h) {
4743     MpegEncContext * const s = &h->s;
4744     const int mb_x = s->mb_x;
4745     const int mb_y = s->mb_y & ~1;
4746     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4747     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4748
4749     unsigned int ctx = 0;
4750
4751     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4752         ctx += 1;
4753     }
4754     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4755         ctx += 1;
4756     }
4757
4758     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4759 }
4760
4761 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4762     uint8_t *state= &h->cabac_state[ctx_base];
4763     int mb_type;
4764
4765     if(intra_slice){
4766         MpegEncContext * const s = &h->s;
4767         const int mba_xy = h->left_mb_xy[0];
4768         const int mbb_xy = h->top_mb_xy;
4769         int ctx=0;
4770         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4771             ctx++;
4772         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4773             ctx++;
4774         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4775             return 0;   /* I4x4 */
4776         state += 2;
4777     }else{
4778         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4779             return 0;   /* I4x4 */
4780     }
4781
4782     if( get_cabac_terminate( &h->cabac ) )
4783         return 25;  /* PCM */
4784
4785     mb_type = 1; /* I16x16 */
4786     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4787     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4788         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4789     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4790     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4791     return mb_type;
4792 }
4793
4794 static int decode_cabac_mb_type( H264Context *h ) {
4795     MpegEncContext * const s = &h->s;
4796
4797     if( h->slice_type_nos == FF_I_TYPE ) {
4798         return decode_cabac_intra_mb_type(h, 3, 1);
4799     } else if( h->slice_type_nos == FF_P_TYPE ) {
4800         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4801             /* P-type */
4802             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4803                 /* P_L0_D16x16, P_8x8 */
4804                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
4805             } else {
4806                 /* P_L0_D8x16, P_L0_D16x8 */
4807                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
4808             }
4809         } else {
4810             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4811         }
4812     } else {
4813         const int mba_xy = h->left_mb_xy[0];
4814         const int mbb_xy = h->top_mb_xy;
4815         int ctx = 0;
4816         int bits;
4817         assert(h->slice_type_nos == FF_B_TYPE);
4818
4819         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4820             ctx++;
4821         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4822             ctx++;
4823
4824         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4825             return 0; /* B_Direct_16x16 */
4826
4827         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4828             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4829         }
4830
4831         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4832         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4833         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4834         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4835         if( bits < 8 )
4836             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4837         else if( bits == 13 ) {
4838             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4839         } else if( bits == 14 )
4840             return 11; /* B_L1_L0_8x16 */
4841         else if( bits == 15 )
4842             return 22; /* B_8x8 */
4843
4844         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4845         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4846     }
4847 }
4848
4849 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4850     MpegEncContext * const s = &h->s;
4851     int mba_xy, mbb_xy;
4852     int ctx = 0;
4853
4854     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4855         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4856         mba_xy = mb_xy - 1;
4857         if( (mb_y&1)
4858             && h->slice_table[mba_xy] == h->slice_num
4859             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4860             mba_xy += s->mb_stride;
4861         if( MB_FIELD ){
4862             mbb_xy = mb_xy - s->mb_stride;
4863             if( !(mb_y&1)
4864                 && h->slice_table[mbb_xy] == h->slice_num
4865                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4866                 mbb_xy -= s->mb_stride;
4867         }else
4868             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4869     }else{
4870         int mb_xy = h->mb_xy;
4871         mba_xy = mb_xy - 1;
4872         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4873     }
4874
4875     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4876         ctx++;
4877     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4878         ctx++;
4879
4880     if( h->slice_type_nos == FF_B_TYPE )
4881         ctx += 13;
4882     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4883 }
4884
4885 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4886     int mode = 0;
4887
4888     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4889         return pred_mode;
4890
4891     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4892     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4893     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4894
4895     if( mode >= pred_mode )
4896         return mode + 1;
4897     else
4898         return mode;
4899 }
4900
4901 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4902     const int mba_xy = h->left_mb_xy[0];
4903     const int mbb_xy = h->top_mb_xy;
4904
4905     int ctx = 0;
4906
4907     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4908     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4909         ctx++;
4910
4911     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4912         ctx++;
4913
4914     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4915         return 0;
4916
4917     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4918         return 1;
4919     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4920         return 2;
4921     else
4922         return 3;
4923 }
4924
4925 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4926     int cbp_b, cbp_a, ctx, cbp = 0;
4927
4928     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
4929     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
4930
4931     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
4932     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
4933     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
4934     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
4935     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
4936     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
4937     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
4938     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
4939     return cbp;
4940 }
4941 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4942     int ctx;
4943     int cbp_a, cbp_b;
4944
4945     cbp_a = (h->left_cbp>>4)&0x03;
4946     cbp_b = (h-> top_cbp>>4)&0x03;
4947
4948     ctx = 0;
4949     if( cbp_a > 0 ) ctx++;
4950     if( cbp_b > 0 ) ctx += 2;
4951     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4952         return 0;
4953
4954     ctx = 4;
4955     if( cbp_a == 2 ) ctx++;
4956     if( cbp_b == 2 ) ctx += 2;
4957     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
4958 }
4959 static int decode_cabac_mb_dqp( H264Context *h) {
4960     int   ctx= h->last_qscale_diff != 0;
4961     int   val = 0;
4962
4963     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4964         ctx= 2+(ctx>>1);
4965         val++;
4966         if(val > 102) //prevent infinite loop
4967             return INT_MIN;
4968     }
4969
4970     if( val&0x01 )
4971         return   (val + 1)>>1 ;
4972     else
4973         return -((val + 1)>>1);
4974 }
4975 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4976     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4977         return 0;   /* 8x8 */
4978     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4979         return 1;   /* 8x4 */
4980     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4981         return 2;   /* 4x8 */
4982     return 3;       /* 4x4 */
4983 }
4984 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4985     int type;
4986     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4987         return 0;   /* B_Direct_8x8 */
4988     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4989         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4990     type = 3;
4991     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4992         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4993             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4994         type += 4;
4995     }
4996     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4997     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4998     return type;
4999 }
5000
5001 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5002     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5003 }
5004
5005 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5006     int refa = h->ref_cache[list][scan8[n] - 1];
5007     int refb = h->ref_cache[list][scan8[n] - 8];
5008     int ref  = 0;
5009     int ctx  = 0;
5010
5011     if( h->slice_type_nos == FF_B_TYPE) {
5012         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5013             ctx++;
5014         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5015             ctx += 2;
5016     } else {
5017         if( refa > 0 )
5018             ctx++;
5019         if( refb > 0 )
5020             ctx += 2;
5021     }
5022
5023     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5024         ref++;
5025         if( ctx < 4 )
5026             ctx = 4;
5027         else
5028             ctx = 5;
5029         if(ref >= 32 /*h->ref_list[list]*/){
5030             return -1;
5031         }
5032     }
5033     return ref;
5034 }
5035
5036 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5037     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5038                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5039     int ctxbase = (l == 0) ? 40 : 47;
5040     int mvd;
5041     int ctx = (amvd>2) + (amvd>32);
5042
5043     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5044         return 0;
5045
5046     mvd= 1;
5047     ctx= 3;
5048     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5049         mvd++;
5050         if( ctx < 6 )
5051             ctx++;
5052     }
5053
5054     if( mvd >= 9 ) {
5055         int k = 3;
5056         while( get_cabac_bypass( &h->cabac ) ) {
5057             mvd += 1 << k;
5058             k++;
5059             if(k>24){
5060                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5061                 return INT_MIN;
5062             }
5063         }
5064         while( k-- ) {
5065             if( get_cabac_bypass( &h->cabac ) )
5066                 mvd += 1 << k;
5067         }
5068     }
5069     return get_cabac_bypass_sign( &h->cabac, -mvd );
5070 }
5071
5072 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5073     int nza, nzb;
5074     int ctx = 0;
5075
5076     if( is_dc ) {
5077         if( cat == 0 ) {
5078             nza = h->left_cbp&0x100;
5079             nzb = h-> top_cbp&0x100;
5080         } else {
5081             nza = (h->left_cbp>>(6+idx))&0x01;
5082             nzb = (h-> top_cbp>>(6+idx))&0x01;
5083         }
5084     } else {
5085         assert(cat == 1 || cat == 2 || cat == 4);
5086         nza = h->non_zero_count_cache[scan8[idx] - 1];
5087         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5088     }
5089
5090     if( nza > 0 )
5091         ctx++;
5092
5093     if( nzb > 0 )
5094         ctx += 2;
5095
5096     return ctx + 4 * cat;
5097 }
5098
5099 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5100     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5101     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5102     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5103     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5104 };
5105
5106 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5107     static const int significant_coeff_flag_offset[2][6] = {
5108       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5109       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5110     };
5111     static const int last_coeff_flag_offset[2][6] = {
5112       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5113       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5114     };
5115     static const int coeff_abs_level_m1_offset[6] = {
5116         227+0, 227+10, 227+20, 227+30, 227+39, 426
5117     };
5118     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5119       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5120         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5121         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5122        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5123       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5124         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5125         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5126         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5127     };
5128     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5129      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5130      * map node ctx => cabac ctx for level=1 */
5131     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5132     /* map node ctx => cabac ctx for level>1 */
5133     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5134     static const uint8_t coeff_abs_level_transition[2][8] = {
5135     /* update node ctx after decoding a level=1 */
5136         { 1, 2, 3, 3, 4, 5, 6, 7 },
5137     /* update node ctx after decoding a level>1 */
5138         { 4, 4, 4, 4, 5, 6, 7, 7 }
5139     };
5140
5141     int index[64];
5142
5143     int av_unused last;
5144     int coeff_count = 0;
5145     int node_ctx = 0;
5146
5147     uint8_t *significant_coeff_ctx_base;
5148     uint8_t *last_coeff_ctx_base;
5149     uint8_t *abs_level_m1_ctx_base;
5150
5151 #ifndef ARCH_X86
5152 #define CABAC_ON_STACK
5153 #endif
5154 #ifdef CABAC_ON_STACK
5155 #define CC &cc
5156     CABACContext cc;
5157     cc.range     = h->cabac.range;
5158     cc.low       = h->cabac.low;
5159     cc.bytestream= h->cabac.bytestream;
5160 #else
5161 #define CC &h->cabac
5162 #endif
5163
5164
5165     /* cat: 0-> DC 16x16  n = 0
5166      *      1-> AC 16x16  n = luma4x4idx
5167      *      2-> Luma4x4   n = luma4x4idx
5168      *      3-> DC Chroma n = iCbCr
5169      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5170      *      5-> Luma8x8   n = 4 * luma8x8idx
5171      */
5172
5173     /* read coded block flag */
5174     if( is_dc || cat != 5 ) {
5175         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5176             if( !is_dc )
5177                 h->non_zero_count_cache[scan8[n]] = 0;
5178
5179 #ifdef CABAC_ON_STACK
5180             h->cabac.range     = cc.range     ;
5181             h->cabac.low       = cc.low       ;
5182             h->cabac.bytestream= cc.bytestream;
5183 #endif
5184             return;
5185         }
5186     }
5187
5188     significant_coeff_ctx_base = h->cabac_state
5189         + significant_coeff_flag_offset[MB_FIELD][cat];
5190     last_coeff_ctx_base = h->cabac_state
5191         + last_coeff_flag_offset[MB_FIELD][cat];
5192     abs_level_m1_ctx_base = h->cabac_state
5193         + coeff_abs_level_m1_offset[cat];
5194
5195     if( !is_dc && cat == 5 ) {
5196 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5197         for(last= 0; last < coefs; last++) { \
5198             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5199             if( get_cabac( CC, sig_ctx )) { \
5200                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5201                 index[coeff_count++] = last; \
5202                 if( get_cabac( CC, last_ctx ) ) { \
5203                     last= max_coeff; \
5204                     break; \
5205                 } \
5206             } \
5207         }\
5208         if( last == max_coeff -1 ) {\
5209             index[coeff_count++] = last;\
5210         }
5211         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5212 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5213         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5214     } else {
5215         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5216 #else
5217         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5218     } else {
5219         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5220 #endif
5221     }
5222     assert(coeff_count > 0);
5223
5224     if( is_dc ) {
5225         if( cat == 0 )
5226             h->cbp_table[h->mb_xy] |= 0x100;
5227         else
5228             h->cbp_table[h->mb_xy] |= 0x40 << n;
5229     } else {
5230         if( cat == 5 )
5231             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5232         else {
5233             assert( cat == 1 || cat == 2 || cat == 4 );
5234             h->non_zero_count_cache[scan8[n]] = coeff_count;
5235         }
5236     }
5237
5238     do {
5239         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5240
5241         int j= scantable[index[--coeff_count]];
5242
5243         if( get_cabac( CC, ctx ) == 0 ) {
5244             node_ctx = coeff_abs_level_transition[0][node_ctx];
5245             if( is_dc ) {
5246                 block[j] = get_cabac_bypass_sign( CC, -1);
5247             }else{
5248                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5249             }
5250         } else {
5251             int coeff_abs = 2;
5252             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5253             node_ctx = coeff_abs_level_transition[1][node_ctx];
5254
5255             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5256                 coeff_abs++;
5257             }
5258
5259             if( coeff_abs >= 15 ) {
5260                 int j = 0;
5261                 while( get_cabac_bypass( CC ) ) {
5262                     j++;
5263                 }
5264
5265                 coeff_abs=1;
5266                 while( j-- ) {
5267                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5268                 }
5269                 coeff_abs+= 14;
5270             }
5271
5272             if( is_dc ) {
5273                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5274             }else{
5275                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5276             }
5277         }
5278     } while( coeff_count );
5279 #ifdef CABAC_ON_STACK
5280             h->cabac.range     = cc.range     ;
5281             h->cabac.low       = cc.low       ;
5282             h->cabac.bytestream= cc.bytestream;
5283 #endif
5284
5285 }
5286
5287 #ifndef CONFIG_SMALL
5288 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5289     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5290 }
5291
5292 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5293     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5294 }
5295 #endif
5296
5297 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5298 #ifdef CONFIG_SMALL
5299     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5300 #else
5301     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5302     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5303 #endif
5304 }
5305
5306 static inline void compute_mb_neighbors(H264Context *h)
5307 {
5308     MpegEncContext * const s = &h->s;
5309     const int mb_xy  = h->mb_xy;
5310     h->top_mb_xy     = mb_xy - s->mb_stride;
5311     h->left_mb_xy[0] = mb_xy - 1;
5312     if(FRAME_MBAFF){
5313         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5314         const int top_pair_xy      = pair_xy     - s->mb_stride;
5315         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5316         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5317         const int curr_mb_frame_flag = !MB_FIELD;
5318         const int bottom = (s->mb_y & 1);
5319         if (bottom
5320                 ? !curr_mb_frame_flag // bottom macroblock
5321                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5322                 ) {
5323             h->top_mb_xy -= s->mb_stride;
5324         }
5325         if (left_mb_frame_flag != curr_mb_frame_flag) {
5326             h->left_mb_xy[0] = pair_xy - 1;
5327         }
5328     } else if (FIELD_PICTURE) {
5329         h->top_mb_xy -= s->mb_stride;
5330     }
5331     return;
5332 }
5333
5334 /**
5335  * decodes a macroblock
5336  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5337  */
5338 static int decode_mb_cabac(H264Context *h) {
5339     MpegEncContext * const s = &h->s;
5340     int mb_xy;
5341     int mb_type, partition_count, cbp = 0;
5342     int dct8x8_allowed= h->pps.transform_8x8_mode;
5343
5344     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5345
5346     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5347
5348     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5349     if( h->slice_type_nos != FF_I_TYPE ) {
5350         int skip;
5351         /* a skipped mb needs the aff flag from the following mb */
5352         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5353             predict_field_decoding_flag(h);
5354         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5355             skip = h->next_mb_skipped;
5356         else
5357             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5358         /* read skip flags */
5359         if( skip ) {
5360             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5361                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5362                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5363                 if(h->next_mb_skipped)
5364                     predict_field_decoding_flag(h);
5365                 else
5366                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5367             }
5368
5369             decode_mb_skip(h);
5370
5371             h->cbp_table[mb_xy] = 0;
5372             h->chroma_pred_mode_table[mb_xy] = 0;
5373             h->last_qscale_diff = 0;
5374
5375             return 0;
5376
5377         }
5378     }
5379     if(FRAME_MBAFF){
5380         if( (s->mb_y&1) == 0 )
5381             h->mb_mbaff =
5382             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5383     }
5384
5385     h->prev_mb_skipped = 0;
5386
5387     compute_mb_neighbors(h);
5388     mb_type = decode_cabac_mb_type( h );
5389     assert(mb_type >= 0);
5390
5391     if( h->slice_type_nos == FF_B_TYPE ) {
5392         if( mb_type < 23 ){
5393             partition_count= b_mb_type_info[mb_type].partition_count;
5394             mb_type=         b_mb_type_info[mb_type].type;
5395         }else{
5396             mb_type -= 23;
5397             goto decode_intra_mb;
5398         }
5399     } else if( h->slice_type_nos == FF_P_TYPE ) {
5400         if( mb_type < 5) {
5401             partition_count= p_mb_type_info[mb_type].partition_count;
5402             mb_type=         p_mb_type_info[mb_type].type;
5403         } else {
5404             mb_type -= 5;
5405             goto decode_intra_mb;
5406         }
5407     } else {
5408         if(h->slice_type == FF_SI_TYPE && mb_type)
5409             mb_type--;
5410         assert(h->slice_type_nos == FF_I_TYPE);
5411 decode_intra_mb:
5412         partition_count = 0;
5413         cbp= i_mb_type_info[mb_type].cbp;
5414         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5415         mb_type= i_mb_type_info[mb_type].type;
5416     }
5417     if(MB_FIELD)
5418         mb_type |= MB_TYPE_INTERLACED;
5419
5420     h->slice_table[ mb_xy ]= h->slice_num;
5421
5422     if(IS_INTRA_PCM(mb_type)) {
5423         const uint8_t *ptr;
5424
5425         // We assume these blocks are very rare so we do not optimize it.
5426         // FIXME The two following lines get the bitstream position in the cabac
5427         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5428         ptr= h->cabac.bytestream;
5429         if(h->cabac.low&0x1) ptr--;
5430         if(CABAC_BITS==16){
5431             if(h->cabac.low&0x1FF) ptr--;
5432         }
5433
5434         // The pixels are stored in the same order as levels in h->mb array.
5435         memcpy(h->mb, ptr, 256); ptr+=256;
5436         if(CHROMA){
5437             memcpy(h->mb+128, ptr, 128); ptr+=128;
5438         }
5439
5440         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5441
5442         // All blocks are present
5443         h->cbp_table[mb_xy] = 0x1ef;
5444         h->chroma_pred_mode_table[mb_xy] = 0;
5445         // In deblocking, the quantizer is 0
5446         s->current_picture.qscale_table[mb_xy]= 0;
5447         // All coeffs are present
5448         memset(h->non_zero_count[mb_xy], 16, 16);
5449         s->current_picture.mb_type[mb_xy]= mb_type;
5450         h->last_qscale_diff = 0;
5451         return 0;
5452     }
5453
5454     if(MB_MBAFF){
5455         h->ref_count[0] <<= 1;
5456         h->ref_count[1] <<= 1;
5457     }
5458
5459     fill_caches(h, mb_type, 0);
5460
5461     if( IS_INTRA( mb_type ) ) {
5462         int i, pred_mode;
5463         if( IS_INTRA4x4( mb_type ) ) {
5464             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5465                 mb_type |= MB_TYPE_8x8DCT;
5466                 for( i = 0; i < 16; i+=4 ) {
5467                     int pred = pred_intra_mode( h, i );
5468                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5469                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5470                 }
5471             } else {
5472                 for( i = 0; i < 16; i++ ) {
5473                     int pred = pred_intra_mode( h, i );
5474                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5475
5476                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5477                 }
5478             }
5479             write_back_intra_pred_mode(h);
5480             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5481         } else {
5482             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5483             if( h->intra16x16_pred_mode < 0 ) return -1;
5484         }
5485         if(CHROMA){
5486             h->chroma_pred_mode_table[mb_xy] =
5487             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5488
5489             pred_mode= check_intra_pred_mode( h, pred_mode );
5490             if( pred_mode < 0 ) return -1;
5491             h->chroma_pred_mode= pred_mode;
5492         }
5493     } else if( partition_count == 4 ) {
5494         int i, j, sub_partition_count[4], list, ref[2][4];
5495
5496         if( h->slice_type_nos == FF_B_TYPE ) {
5497             for( i = 0; i < 4; i++ ) {
5498                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5499                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5500                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5501             }
5502             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5503                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5504                 pred_direct_motion(h, &mb_type);
5505                 h->ref_cache[0][scan8[4]] =
5506                 h->ref_cache[1][scan8[4]] =
5507                 h->ref_cache[0][scan8[12]] =
5508                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5509                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5510                     for( i = 0; i < 4; i++ )
5511                         if( IS_DIRECT(h->sub_mb_type[i]) )
5512                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5513                 }
5514             }
5515         } else {
5516             for( i = 0; i < 4; i++ ) {
5517                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5518                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5519                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5520             }
5521         }
5522
5523         for( list = 0; list < h->list_count; list++ ) {
5524                 for( i = 0; i < 4; i++ ) {
5525                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5526                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5527                         if( h->ref_count[list] > 1 ){
5528                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5529                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5530                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5531                                 return -1;
5532                             }
5533                         }else
5534                             ref[list][i] = 0;
5535                     } else {
5536                         ref[list][i] = -1;
5537                     }
5538                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5539                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5540                 }
5541         }
5542
5543         if(dct8x8_allowed)
5544             dct8x8_allowed = get_dct8x8_allowed(h);
5545
5546         for(list=0; list<h->list_count; list++){
5547             for(i=0; i<4; i++){
5548                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5549                 if(IS_DIRECT(h->sub_mb_type[i])){
5550                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5551                     continue;
5552                 }
5553
5554                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5555                     const int sub_mb_type= h->sub_mb_type[i];
5556                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5557                     for(j=0; j<sub_partition_count[i]; j++){
5558                         int mpx, mpy;
5559                         int mx, my;
5560                         const int index= 4*i + block_width*j;
5561                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5562                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5563                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5564
5565                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5566                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5567                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5568
5569                         if(IS_SUB_8X8(sub_mb_type)){
5570                             mv_cache[ 1 ][0]=
5571                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5572                             mv_cache[ 1 ][1]=
5573                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5574
5575                             mvd_cache[ 1 ][0]=
5576                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5577                             mvd_cache[ 1 ][1]=
5578                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5579                         }else if(IS_SUB_8X4(sub_mb_type)){
5580                             mv_cache[ 1 ][0]= mx;
5581                             mv_cache[ 1 ][1]= my;
5582
5583                             mvd_cache[ 1 ][0]= mx - mpx;
5584                             mvd_cache[ 1 ][1]= my - mpy;
5585                         }else if(IS_SUB_4X8(sub_mb_type)){
5586                             mv_cache[ 8 ][0]= mx;
5587                             mv_cache[ 8 ][1]= my;
5588
5589                             mvd_cache[ 8 ][0]= mx - mpx;
5590                             mvd_cache[ 8 ][1]= my - mpy;
5591                         }
5592                         mv_cache[ 0 ][0]= mx;
5593                         mv_cache[ 0 ][1]= my;
5594
5595                         mvd_cache[ 0 ][0]= mx - mpx;
5596                         mvd_cache[ 0 ][1]= my - mpy;
5597                     }
5598                 }else{
5599                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5600                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5601                     p[0] = p[1] = p[8] = p[9] = 0;
5602                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5603                 }
5604             }
5605         }
5606     } else if( IS_DIRECT(mb_type) ) {
5607         pred_direct_motion(h, &mb_type);
5608         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5609         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5610         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5611     } else {
5612         int list, mx, my, i, mpx, mpy;
5613         if(IS_16X16(mb_type)){
5614             for(list=0; list<h->list_count; list++){
5615                 if(IS_DIR(mb_type, 0, list)){
5616                     int ref;
5617                     if(h->ref_count[list] > 1){
5618                         ref= decode_cabac_mb_ref(h, list, 0);
5619                         if(ref >= (unsigned)h->ref_count[list]){
5620                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5621                             return -1;
5622                         }
5623                     }else
5624                         ref=0;
5625                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5626                 }else
5627                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5628             }
5629             for(list=0; list<h->list_count; list++){
5630                 if(IS_DIR(mb_type, 0, list)){
5631                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5632
5633                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5634                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5635                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5636
5637                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5638                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5639                 }else
5640                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5641             }
5642         }
5643         else if(IS_16X8(mb_type)){
5644             for(list=0; list<h->list_count; list++){
5645                     for(i=0; i<2; i++){
5646                         if(IS_DIR(mb_type, i, list)){
5647                             int ref;
5648                             if(h->ref_count[list] > 1){
5649                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5650                                 if(ref >= (unsigned)h->ref_count[list]){
5651                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5652                                     return -1;
5653                                 }
5654                             }else
5655                                 ref=0;
5656                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5657                         }else
5658                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5659                     }
5660             }
5661             for(list=0; list<h->list_count; list++){
5662                 for(i=0; i<2; i++){
5663                     if(IS_DIR(mb_type, i, list)){
5664                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5665                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5666                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5667                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5668
5669                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5670                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5671                     }else{
5672                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5673                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5674                     }
5675                 }
5676             }
5677         }else{
5678             assert(IS_8X16(mb_type));
5679             for(list=0; list<h->list_count; list++){
5680                     for(i=0; i<2; i++){
5681                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5682                             int ref;
5683                             if(h->ref_count[list] > 1){
5684                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5685                                 if(ref >= (unsigned)h->ref_count[list]){
5686                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5687                                     return -1;
5688                                 }
5689                             }else
5690                                 ref=0;
5691                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5692                         }else
5693                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5694                     }
5695             }
5696             for(list=0; list<h->list_count; list++){
5697                 for(i=0; i<2; i++){
5698                     if(IS_DIR(mb_type, i, list)){
5699                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5700                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5701                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5702
5703                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5704                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5705                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5706                     }else{
5707                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5708                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5709                     }
5710                 }
5711             }
5712         }
5713     }
5714
5715    if( IS_INTER( mb_type ) ) {
5716         h->chroma_pred_mode_table[mb_xy] = 0;
5717         write_back_motion( h, mb_type );
5718    }
5719
5720     if( !IS_INTRA16x16( mb_type ) ) {
5721         cbp  = decode_cabac_mb_cbp_luma( h );
5722         if(CHROMA)
5723             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5724     }
5725
5726     h->cbp_table[mb_xy] = h->cbp = cbp;
5727
5728     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5729         if( decode_cabac_mb_transform_size( h ) )
5730             mb_type |= MB_TYPE_8x8DCT;
5731     }
5732     s->current_picture.mb_type[mb_xy]= mb_type;
5733
5734     if( cbp || IS_INTRA16x16( mb_type ) ) {
5735         const uint8_t *scan, *scan8x8, *dc_scan;
5736         const uint32_t *qmul;
5737         int dqp;
5738
5739         if(IS_INTERLACED(mb_type)){
5740             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5741             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5742             dc_scan= luma_dc_field_scan;
5743         }else{
5744             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5745             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5746             dc_scan= luma_dc_zigzag_scan;
5747         }
5748
5749         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5750         if( dqp == INT_MIN ){
5751             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5752             return -1;
5753         }
5754         s->qscale += dqp;
5755         if(((unsigned)s->qscale) > 51){
5756             if(s->qscale<0) s->qscale+= 52;
5757             else            s->qscale-= 52;
5758         }
5759         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5760         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5761
5762         if( IS_INTRA16x16( mb_type ) ) {
5763             int i;
5764             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5765             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5766
5767             if( cbp&15 ) {
5768                 qmul = h->dequant4_coeff[0][s->qscale];
5769                 for( i = 0; i < 16; i++ ) {
5770                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5771                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5772                 }
5773             } else {
5774                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5775             }
5776         } else {
5777             int i8x8, i4x4;
5778             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5779                 if( cbp & (1<<i8x8) ) {
5780                     if( IS_8x8DCT(mb_type) ) {
5781                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5782                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5783                     } else {
5784                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5785                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5786                             const int index = 4*i8x8 + i4x4;
5787                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5788 //START_TIMER
5789                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5790 //STOP_TIMER("decode_residual")
5791                         }
5792                     }
5793                 } else {
5794                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5795                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5796                 }
5797             }
5798         }
5799
5800         if( cbp&0x30 ){
5801             int c;
5802             for( c = 0; c < 2; c++ ) {
5803                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5804                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5805             }
5806         }
5807
5808         if( cbp&0x20 ) {
5809             int c, i;
5810             for( c = 0; c < 2; c++ ) {
5811                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5812                 for( i = 0; i < 4; i++ ) {
5813                     const int index = 16 + 4 * c + i;
5814                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5815                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5816                 }
5817             }
5818         } else {
5819             uint8_t * const nnz= &h->non_zero_count_cache[0];
5820             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5821             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5822         }
5823     } else {
5824         uint8_t * const nnz= &h->non_zero_count_cache[0];
5825         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5826         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5827         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5828         h->last_qscale_diff = 0;
5829     }
5830
5831     s->current_picture.qscale_table[mb_xy]= s->qscale;
5832     write_back_non_zero_count(h);
5833
5834     if(MB_MBAFF){
5835         h->ref_count[0] >>= 1;
5836         h->ref_count[1] >>= 1;
5837     }
5838
5839     return 0;
5840 }
5841
5842
5843 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5844     int i, d;
5845     const int index_a = qp + h->slice_alpha_c0_offset;
5846     const int alpha = (alpha_table+52)[index_a];
5847     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5848
5849     if( bS[0] < 4 ) {
5850         int8_t tc[4];
5851         for(i=0; i<4; i++)
5852             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
5853         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5854     } else {
5855         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5856     }
5857 }
5858 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5859     int i;
5860     const int index_a = qp + h->slice_alpha_c0_offset;
5861     const int alpha = (alpha_table+52)[index_a];
5862     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
5863
5864     if( bS[0] < 4 ) {
5865         int8_t tc[4];
5866         for(i=0; i<4; i++)
5867             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
5868         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5869     } else {
5870         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5871     }
5872 }
5873
5874 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5875     int i;
5876     for( i = 0; i < 16; i++, pix += stride) {
5877         int index_a;
5878         int alpha;
5879         int beta;
5880
5881         int qp_index;
5882         int bS_index = (i >> 1);
5883         if (!MB_FIELD) {
5884             bS_index &= ~1;
5885             bS_index |= (i & 1);
5886         }
5887
5888         if( bS[bS_index] == 0 ) {
5889             continue;
5890         }
5891
5892         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5893         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5894         alpha = (alpha_table+52)[index_a];
5895         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5896
5897         if( bS[bS_index] < 4 ) {
5898             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
5899             const int p0 = pix[-1];
5900             const int p1 = pix[-2];
5901             const int p2 = pix[-3];
5902             const int q0 = pix[0];
5903             const int q1 = pix[1];
5904             const int q2 = pix[2];
5905
5906             if( FFABS( p0 - q0 ) < alpha &&
5907                 FFABS( p1 - p0 ) < beta &&
5908                 FFABS( q1 - q0 ) < beta ) {
5909                 int tc = tc0;
5910                 int i_delta;
5911
5912                 if( FFABS( p2 - p0 ) < beta ) {
5913                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5914                     tc++;
5915                 }
5916                 if( FFABS( q2 - q0 ) < beta ) {
5917                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5918                     tc++;
5919                 }
5920
5921                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5922                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
5923                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
5924                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5925             }
5926         }else{
5927             const int p0 = pix[-1];
5928             const int p1 = pix[-2];
5929             const int p2 = pix[-3];
5930
5931             const int q0 = pix[0];
5932             const int q1 = pix[1];
5933             const int q2 = pix[2];
5934
5935             if( FFABS( p0 - q0 ) < alpha &&
5936                 FFABS( p1 - p0 ) < beta &&
5937                 FFABS( q1 - q0 ) < beta ) {
5938
5939                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5940                     if( FFABS( p2 - p0 ) < beta)
5941                     {
5942                         const int p3 = pix[-4];
5943                         /* p0', p1', p2' */
5944                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5945                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5946                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5947                     } else {
5948                         /* p0' */
5949                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5950                     }
5951                     if( FFABS( q2 - q0 ) < beta)
5952                     {
5953                         const int q3 = pix[3];
5954                         /* q0', q1', q2' */
5955                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5956                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5957                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5958                     } else {
5959                         /* q0' */
5960                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5961                     }
5962                 }else{
5963                     /* p0', q0' */
5964                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5965                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5966                 }
5967                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5968             }
5969         }
5970     }
5971 }
5972 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5973     int i;
5974     for( i = 0; i < 8; i++, pix += stride) {
5975         int index_a;
5976         int alpha;
5977         int beta;
5978
5979         int qp_index;
5980         int bS_index = i;
5981
5982         if( bS[bS_index] == 0 ) {
5983             continue;
5984         }
5985
5986         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
5987         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5988         alpha = (alpha_table+52)[index_a];
5989         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5990
5991         if( bS[bS_index] < 4 ) {
5992             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
5993             const int p0 = pix[-1];
5994             const int p1 = pix[-2];
5995             const int q0 = pix[0];
5996             const int q1 = pix[1];
5997
5998             if( FFABS( p0 - q0 ) < alpha &&
5999                 FFABS( p1 - p0 ) < beta &&
6000                 FFABS( q1 - q0 ) < beta ) {
6001                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6002
6003                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6004                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6005                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6006             }
6007         }else{
6008             const int p0 = pix[-1];
6009             const int p1 = pix[-2];
6010             const int q0 = pix[0];
6011             const int q1 = pix[1];
6012
6013             if( FFABS( p0 - q0 ) < alpha &&
6014                 FFABS( p1 - p0 ) < beta &&
6015                 FFABS( q1 - q0 ) < beta ) {
6016
6017                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6018                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6019                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6020             }
6021         }
6022     }
6023 }
6024
6025 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6026     int i, d;
6027     const int index_a = qp + h->slice_alpha_c0_offset;
6028     const int alpha = (alpha_table+52)[index_a];
6029     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6030     const int pix_next  = stride;
6031
6032     if( bS[0] < 4 ) {
6033         int8_t tc[4];
6034         for(i=0; i<4; i++)
6035             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6036         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6037     } else {
6038         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6039     }
6040 }
6041
6042 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6043     int i;
6044     const int index_a = qp + h->slice_alpha_c0_offset;
6045     const int alpha = (alpha_table+52)[index_a];
6046     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6047
6048     if( bS[0] < 4 ) {
6049         int8_t tc[4];
6050         for(i=0; i<4; i++)
6051             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6052         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6053     } else {
6054         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6055     }
6056 }
6057
/**
 * Fast-path deblocking of one macroblock.
 *
 * Hands the macroblock to filter_mb() whenever one of the conditions
 * tested at the top holds; otherwise it filters the luma and chroma edges
 * itself, using the left/top neighbour quantizers from qscale_table and
 * either fixed boundary strengths (intra MB) or strengths produced by
 * dsp.h264_loop_filter_strength (inter MB).
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma pixels of this macroblock
 * @param img_cb     Cb pixels of this macroblock
 * @param img_cr     Cr pixels of this macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6058 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6059     MpegEncContext * const s = &h->s;
         /* 1 when decoding a bottom field, 0 otherwise; compared against mb_y below */
6060     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6061     int mb_xy, mb_type;
6062     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6063
6064     mb_xy = h->mb_xy;
6065
         /* Fall back to filter_mb() for: first column, first row, missing dsp
          * strength function, non-zero pps.chroma_qp_diff, CODEC_FLAG2_FAST not
          * set, or deblocking_filter == 2 with the top or left MB in another slice. */
6066     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6067         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus hasto be, but should not under CODEC_FLAG2_FAST
6068        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6069                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6070         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6071         return;
6072     }
6073     assert(!FRAME_MBAFF);
6074
         /* qp: this MB, qp0: left MB, qp1: top MB; qpc* are the chroma
          * quantizers derived from them with get_chroma_qp(h, 0, ...) */
6075     mb_type = s->current_picture.mb_type[mb_xy];
6076     qp = s->current_picture.qscale_table[mb_xy];
6077     qp0 = s->current_picture.qscale_table[mb_xy-1];
6078     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6079     qpc = get_chroma_qp( h, 0, qp );
6080     qpc0 = get_chroma_qp( h, 0, qp0 );
6081     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* from here on qp0/qp1/qpc0/qpc1 hold the rounded average with this MB */
6082     qp0 = (qp + qp0 + 1) >> 1;
6083     qp1 = (qp + qp1 + 1) >> 1;
6084     qpc0 = (qpc + qpc0 + 1) >> 1;
6085     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* when every quantizer involved is at or below the threshold, return without filtering */
6086     qp_thresh = 15 - h->slice_alpha_c0_offset;
6087     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6088        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6089         return;
6090
6091     if( IS_INTRA(mb_type) ) {
             /* intra MB: strength 4 on the left edge, 3 on inner edges; the top
              * edge (bSH) uses 3 instead of 4 when FIELD_PICTURE is set */
6092         int16_t bS4[4] = {4,4,4,4};
6093         int16_t bS3[4] = {3,3,3,3};
6094         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6095         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 are filtered */
6096             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6097             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6098             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6099             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6100         } else {
6101             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6102             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6103             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6104             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6105             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6106             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6107             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6108             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6109         }
             /* chroma: edges 0 and 2 of each plane, vertical then horizontal */
6110         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6111         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6112         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6113         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6114         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6115         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6116         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6117         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6118         return;
6119     } else {
             /* bS[dir][edge][i]: dir 0 is used for vertical edges, dir 1 for
              * horizontal ones (see the FILTER invocations below).
              * bSv views each group of four int16_t strengths as one uint64_t so
              * an edge can be set or tested in a single operation.
              * NOTE(review): this reads/writes int16_t storage through a uint64_t
              * lvalue -- confirm the build tolerates that aliasing. */
6120         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6121         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6122         int edges;
6123         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* strengths are not computed here: edges 0 and 2 in both
                  * directions are set to 2 (edges 1 and 3 are never read on the
                  * 8x8DCT path below) */
6124             edges = 4;
6125             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6126         } else {
                 /* masks, step and edge count are inputs to the dsp strength
                  * routine; their exact meaning is defined by that routine */
6127             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6128                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6129             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6130                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6131                              ? 3 : 0;
6132             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6133             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6134             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6135                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6136         }
             /* an intra neighbour overrides edge 0: 4 for the left edge; for the
              * top edge 3 when FIELD_PICTURE is set, 4 otherwise */
6137         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6138             bSv[0][0] = 0x0004000400040004ULL;
6139         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6140             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6141
             /* FILTER(hv,dir,edge): if any of the four strengths of that edge is
              * non-zero, filter the luma edge and, for even edges, both chroma
              * planes. Edge 0 uses the neighbour-averaged quantizer
              * (qp0/qp1, qpc0/qpc1 via token pasting with dir), inner edges use
              * qp/qpc. */
6142 #define FILTER(hv,dir,edge)\
6143         if(bSv[dir][edge]) {\
6144             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6145             if(!(edge&1)) {\
6146                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6147                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6148             }\
6149         }
6150         if( edges == 1 ) {
6151             FILTER(v,0,0);
6152             FILTER(h,1,0);
6153         } else if( IS_8x8DCT(mb_type) ) {
6154             FILTER(v,0,0);
6155             FILTER(v,0,2);
6156             FILTER(h,1,0);
6157             FILTER(h,1,2);
6158         } else {
6159             FILTER(v,0,0);
6160             FILTER(v,0,1);
6161             FILTER(v,0,2);
6162             FILTER(v,0,3);
6163             FILTER(h,1,0);
6164             FILTER(h,1,1);
6165             FILTER(h,1,2);
6166             FILTER(h,1,3);
6167         }
6168 #undef FILTER
6169     }
6170 }
6171
6172
6173 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6174     MpegEncContext * const s = &h->s;
6175     int edge;
6176     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6177     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6178     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6179     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6180     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6181
6182     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6183                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6184     // how often to recheck mv-based bS when iterating between edges
6185     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6186                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6187     // how often to recheck mv-based bS when iterating along each edge
6188     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6189
6190     if (first_vertical_edge_done) {
6191         start = 1;
6192     }
6193
6194     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6195         start = 1;
6196
6197     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6198         && !IS_INTERLACED(mb_type)
6199         && IS_INTERLACED(mbm_type)
6200         ) {
6201         // This is a special case in the norm where the filtering must
6202         // be done twice (one each of the field) even if we are in a
6203         // frame macroblock.
6204         //
6205         static const int nnz_idx[4] = {4,5,6,3};
6206         unsigned int tmp_linesize   = 2 *   linesize;
6207         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6208         int mbn_xy = mb_xy - 2 * s->mb_stride;
6209         int qp;
6210         int i, j;
6211         int16_t bS[4];
6212
6213         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6214             if( IS_INTRA(mb_type) ||
6215                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6216                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6217             } else {
6218                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6219                 for( i = 0; i < 4; i++ ) {
6220                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6221                         mbn_nnz[nnz_idx[i]] != 0 )
6222                         bS[i] = 2;
6223                     else
6224                         bS[i] = 1;
6225                 }
6226             }
6227             // Do not use s->qscale as luma quantizer because it has not the same
6228             // value in IPCM macroblocks.
6229             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6230             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6231             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6232             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6233             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6234                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6235             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6236                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6237         }
6238
6239         start = 1;
6240     }
6241
6242     /* Calculate bS */
6243     for( edge = start; edge < edges; edge++ ) {
6244         /* mbn_xy: neighbor macroblock */
6245         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6246         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6247         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6248         int16_t bS[4];
6249         int qp;
6250
6251         if( (edge&1) && IS_8x8DCT(mb_type) )
6252             continue;
6253
6254         if( IS_INTRA(mb_type) ||
6255             IS_INTRA(mbn_type) ) {
6256             int value;
6257             if (edge == 0) {
6258                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6259                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6260                 ) {
6261                     value = 4;
6262                 } else {
6263                     value = 3;
6264                 }
6265             } else {
6266                 value = 3;
6267             }
6268             bS[0] = bS[1] = bS[2] = bS[3] = value;
6269         } else {
6270             int i, l;
6271             int mv_done;
6272
6273             if( edge & mask_edge ) {
6274                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6275                 mv_done = 1;
6276             }
6277             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6278                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6279                 mv_done = 1;
6280             }
6281             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6282                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6283                 int bn_idx= b_idx - (dir ? 8:1);
6284                 int v = 0;
6285
6286                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6287                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6288                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6289                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6290                 }
6291
6292                 if(h->slice_type_nos == FF_B_TYPE && v){
6293                     v=0;
6294                     for( l = 0; !v && l < 2; l++ ) {
6295                         int ln= 1-l;
6296                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6297                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6298                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6299                     }
6300                 }
6301
6302                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6303                 mv_done = 1;
6304             }
6305             else
6306                 mv_done = 0;
6307
6308             for( i = 0; i < 4; i++ ) {
6309                 int x = dir == 0 ? edge : i;
6310                 int y = dir == 0 ? i    : edge;
6311                 int b_idx= 8 + 4 + x + 8*y;
6312                 int bn_idx= b_idx - (dir ? 8:1);
6313
6314                 if( h->non_zero_count_cache[b_idx] |
6315                     h->non_zero_count_cache[bn_idx] ) {
6316                     bS[i] = 2;
6317                 }
6318                 else if(!mv_done)
6319                 {
6320                     bS[i] = 0;
6321                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6322                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6323                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6324                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6325                             bS[i] = 1;
6326                             break;
6327                         }
6328                     }
6329
6330                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6331                         bS[i] = 0;
6332                         for( l = 0; l < 2; l++ ) {
6333                             int ln= 1-l;
6334                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6335                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6336                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6337                                 bS[i] = 1;
6338                                 break;
6339                             }
6340                         }
6341                     }
6342                 }
6343             }
6344
6345             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6346                 continue;
6347         }
6348
6349         /* Filter edge */
6350         // Do not use s->qscale as luma quantizer because it has not the same
6351         // value in IPCM macroblocks.
6352         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6353         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6354         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6355         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6356         if( dir == 0 ) {
6357             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6358             if( (edge&1) == 0 ) {
6359                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6360                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6361                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6362                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6363             }
6364         } else {
6365             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6366             if( (edge&1) == 0 ) {
6367                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6368                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6369                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6370                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6371             }
6372         }
6373     }
6374 }
6375
6376 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6377     MpegEncContext * const s = &h->s;
6378     const int mb_xy= mb_x + mb_y*s->mb_stride;
6379     const int mb_type = s->current_picture.mb_type[mb_xy];
6380     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6381     int first_vertical_edge_done = 0;
6382     int dir;
6383
6384     //for sufficiently low qp, filtering wouldn't do anything
6385     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6386     if(!FRAME_MBAFF){
6387         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6388         int qp = s->current_picture.qscale_table[mb_xy];
6389         if(qp <= qp_thresh
6390            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6391            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6392             return;
6393         }
6394     }
6395
6396     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6397     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6398         int top_type, left_type[2];
6399         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6400         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6401         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6402
6403         if(IS_8x8DCT(top_type)){
6404             h->non_zero_count_cache[4+8*0]=
6405             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6406             h->non_zero_count_cache[6+8*0]=
6407             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6408         }
6409         if(IS_8x8DCT(left_type[0])){
6410             h->non_zero_count_cache[3+8*1]=
6411             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6412         }
6413         if(IS_8x8DCT(left_type[1])){
6414             h->non_zero_count_cache[3+8*3]=
6415             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6416         }
6417
6418         if(IS_8x8DCT(mb_type)){
6419             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6420             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6421
6422             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6423             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6424
6425             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6426             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6427
6428             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6429             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6430         }
6431     }
6432
6433     if (FRAME_MBAFF
6434             // left mb is in picture
6435             && h->slice_table[mb_xy-1] != 0xFFFF
6436             // and current and left pair do not have the same interlaced type
6437             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6438             // and left mb is in the same slice if deblocking_filter == 2
6439             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6440         /* First vertical edge is different in MBAFF frames
6441          * There are 8 different bS to compute and 2 different Qp
6442          */
6443         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
6444         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6445         int16_t bS[8];
6446         int qp[2];
6447         int bqp[2];
6448         int rqp[2];
6449         int mb_qp, mbn0_qp, mbn1_qp;
6450         int i;
6451         first_vertical_edge_done = 1;
6452
6453         if( IS_INTRA(mb_type) )
6454             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6455         else {
6456             for( i = 0; i < 8; i++ ) {
6457                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6458
6459                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6460                     bS[i] = 4;
6461                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6462                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6463                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6464                                                                        :
6465                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6466                     bS[i] = 2;
6467                 else
6468                     bS[i] = 1;
6469             }
6470         }
6471
6472         mb_qp = s->current_picture.qscale_table[mb_xy];
6473         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6474         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6475         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6476         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6477                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6478         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6479                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6480         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6481         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6482                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6483         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6484                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6485
6486         /* Filter edge */
6487         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6488         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6489         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6490         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6491         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6492     }
6493
6494 #ifdef CONFIG_SMALL
6495     for( dir = 0; dir < 2; dir++ )
6496         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6497 #else
6498     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6499     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6500 #endif
6501 }
6502
6503 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6504     H264Context *h = *(void**)arg;
6505     MpegEncContext * const s = &h->s;
6506     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6507
6508     s->mb_skip_run= -1;
6509
6510     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6511                     (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || (ENABLE_H264_ENCODER && s->encoding);
6512
6513     if( h->pps.cabac ) {
6514         int i;
6515
6516         /* realign */
6517         align_get_bits( &s->gb );
6518
6519         /* init cabac */
6520         ff_init_cabac_states( &h->cabac);
6521         ff_init_cabac_decoder( &h->cabac,
6522                                s->gb.buffer + get_bits_count(&s->gb)/8,
6523                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6524         /* calculate pre-state */
6525         for( i= 0; i < 460; i++ ) {
6526             int pre;
6527             if( h->slice_type_nos == FF_I_TYPE )
6528                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6529             else
6530                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6531
6532             if( pre <= 63 )
6533                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6534             else
6535                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6536         }
6537
6538         for(;;){
6539 //START_TIMER
6540             int ret = decode_mb_cabac(h);
6541             int eos;
6542 //STOP_TIMER("decode_mb_cabac")
6543
6544             if(ret>=0) hl_decode_mb(h);
6545
6546             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6547                 s->mb_y++;
6548
6549                 if(ret>=0) ret = decode_mb_cabac(h);
6550
6551                 if(ret>=0) hl_decode_mb(h);
6552                 s->mb_y--;
6553             }
6554             eos = get_cabac_terminate( &h->cabac );
6555
6556             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6557                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6558                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6559                 return -1;
6560             }
6561
6562             if( ++s->mb_x >= s->mb_width ) {
6563                 s->mb_x = 0;
6564                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6565                 ++s->mb_y;
6566                 if(FIELD_OR_MBAFF_PICTURE) {
6567                     ++s->mb_y;
6568                 }
6569             }
6570
6571             if( eos || s->mb_y >= s->mb_height ) {
6572                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6573                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6574                 return 0;
6575             }
6576         }
6577
6578     } else {
6579         for(;;){
6580             int ret = decode_mb_cavlc(h);
6581
6582             if(ret>=0) hl_decode_mb(h);
6583
6584             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6585                 s->mb_y++;
6586                 ret = decode_mb_cavlc(h);
6587
6588                 if(ret>=0) hl_decode_mb(h);
6589                 s->mb_y--;
6590             }
6591
6592             if(ret<0){
6593                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6594                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6595
6596                 return -1;
6597             }
6598
6599             if(++s->mb_x >= s->mb_width){
6600                 s->mb_x=0;
6601                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6602                 ++s->mb_y;
6603                 if(FIELD_OR_MBAFF_PICTURE) {
6604                     ++s->mb_y;
6605                 }
6606                 if(s->mb_y >= s->mb_height){
6607                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6608
6609                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6610                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6611
6612                         return 0;
6613                     }else{
6614                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6615
6616                         return -1;
6617                     }
6618                 }
6619             }
6620
6621             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6622                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6623                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6624                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6625
6626                     return 0;
6627                 }else{
6628                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6629
6630                     return -1;
6631                 }
6632             }
6633         }
6634     }
6635
6636 #if 0
6637     for(;s->mb_y < s->mb_height; s->mb_y++){
6638         for(;s->mb_x < s->mb_width; s->mb_x++){
6639             int ret= decode_mb(h);
6640
6641             hl_decode_mb(h);
6642
6643             if(ret<0){
6644                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6645                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6646
6647                 return -1;
6648             }
6649
6650             if(++s->mb_x >= s->mb_width){
6651                 s->mb_x=0;
6652                 if(++s->mb_y >= s->mb_height){
6653                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6654                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6655
6656                         return 0;
6657                     }else{
6658                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6659
6660                         return -1;
6661                     }
6662                 }
6663             }
6664
6665             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6666                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6667                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6668
6669                     return 0;
6670                 }else{
6671                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6672
6673                     return -1;
6674                 }
6675             }
6676         }
6677         s->mb_x=0;
6678         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6679     }
6680 #endif
6681     return -1; //not reached
6682 }
6683
6684 static int decode_picture_timing(H264Context *h){
6685     MpegEncContext * const s = &h->s;
6686     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6687         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6688         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6689     }
6690     if(h->sps.pic_struct_present_flag){
6691         unsigned int i, num_clock_ts;
6692         h->sei_pic_struct = get_bits(&s->gb, 4);
6693
6694         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6695             return -1;
6696
6697         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6698
6699         for (i = 0 ; i < num_clock_ts ; i++){
6700             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6701                 unsigned int full_timestamp_flag;
6702                 skip_bits(&s->gb, 2);                 /* ct_type */
6703                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6704                 skip_bits(&s->gb, 5);                 /* counting_type */
6705                 full_timestamp_flag = get_bits(&s->gb, 1);
6706                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6707                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6708                 skip_bits(&s->gb, 8);                 /* n_frames */
6709                 if(full_timestamp_flag){
6710                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6711                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6712                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6713                 }else{
6714                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6715                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6716                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6717                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6718                             if(get_bits(&s->gb, 1))   /* hours_flag */
6719                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6720                         }
6721                     }
6722                 }
6723                 if(h->sps.time_offset_length > 0)
6724                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6725             }
6726         }
6727     }
6728     return 0;
6729 }
6730
6731 static int decode_unregistered_user_data(H264Context *h, int size){
6732     MpegEncContext * const s = &h->s;
6733     uint8_t user_data[16+256];
6734     int e, build, i;
6735
6736     if(size<16)
6737         return -1;
6738
6739     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6740         user_data[i]= get_bits(&s->gb, 8);
6741     }
6742
6743     user_data[i]= 0;
6744     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6745     if(e==1 && build>=0)
6746         h->x264_build= build;
6747
6748     if(s->avctx->debug & FF_DEBUG_BUGS)
6749         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6750
6751     for(; i<size; i++)
6752         skip_bits(&s->gb, 8);
6753
6754     return 0;
6755 }
6756
6757 static int decode_sei(H264Context *h){
6758     MpegEncContext * const s = &h->s;
6759
6760     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6761         int size, type;
6762
6763         type=0;
6764         do{
6765             type+= show_bits(&s->gb, 8);
6766         }while(get_bits(&s->gb, 8) == 255);
6767
6768         size=0;
6769         do{
6770             size+= show_bits(&s->gb, 8);
6771         }while(get_bits(&s->gb, 8) == 255);
6772
6773         switch(type){
6774         case 1: // Picture timing SEI
6775             if(decode_picture_timing(h) < 0)
6776                 return -1;
6777             break;
6778         case 5:
6779             if(decode_unregistered_user_data(h, size) < 0)
6780                 return -1;
6781             break;
6782         default:
6783             skip_bits(&s->gb, 8*size);
6784         }
6785
6786         //FIXME check bits here
6787         align_get_bits(&s->gb);
6788     }
6789
6790     return 0;
6791 }
6792
6793 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6794     MpegEncContext * const s = &h->s;
6795     int cpb_count, i;
6796     cpb_count = get_ue_golomb(&s->gb) + 1;
6797
6798     if(cpb_count > 32U){
6799         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6800         return -1;
6801     }
6802
6803     get_bits(&s->gb, 4); /* bit_rate_scale */
6804     get_bits(&s->gb, 4); /* cpb_size_scale */
6805     for(i=0; i<cpb_count; i++){
6806         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6807         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6808         get_bits1(&s->gb);     /* cbr_flag */
6809     }
6810     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6811     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6812     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6813     sps->time_offset_length = get_bits(&s->gb, 5);
6814     return 0;
6815 }
6816
6817 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6818     MpegEncContext * const s = &h->s;
6819     int aspect_ratio_info_present_flag;
6820     unsigned int aspect_ratio_idc;
6821
6822     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6823
6824     if( aspect_ratio_info_present_flag ) {
6825         aspect_ratio_idc= get_bits(&s->gb, 8);
6826         if( aspect_ratio_idc == EXTENDED_SAR ) {
6827             sps->sar.num= get_bits(&s->gb, 16);
6828             sps->sar.den= get_bits(&s->gb, 16);
6829         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6830             sps->sar=  pixel_aspect[aspect_ratio_idc];
6831         }else{
6832             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6833             return -1;
6834         }
6835     }else{
6836         sps->sar.num=
6837         sps->sar.den= 0;
6838     }
6839 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6840
6841     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6842         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6843     }
6844
6845     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6846         get_bits(&s->gb, 3);    /* video_format */
6847         get_bits1(&s->gb);      /* video_full_range_flag */
6848         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6849             get_bits(&s->gb, 8); /* colour_primaries */
6850             get_bits(&s->gb, 8); /* transfer_characteristics */
6851             get_bits(&s->gb, 8); /* matrix_coefficients */
6852         }
6853     }
6854
6855     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6856         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6857         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6858     }
6859
6860     sps->timing_info_present_flag = get_bits1(&s->gb);
6861     if(sps->timing_info_present_flag){
6862         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6863         sps->time_scale = get_bits_long(&s->gb, 32);
6864         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6865     }
6866
6867     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6868     if(sps->nal_hrd_parameters_present_flag)
6869         if(decode_hrd_parameters(h, sps) < 0)
6870             return -1;
6871     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6872     if(sps->vcl_hrd_parameters_present_flag)
6873         if(decode_hrd_parameters(h, sps) < 0)
6874             return -1;
6875     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6876         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6877     sps->pic_struct_present_flag = get_bits1(&s->gb);
6878
6879     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6880     if(sps->bitstream_restriction_flag){
6881         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6882         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6883         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6884         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6885         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6886         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6887         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6888
6889         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6890             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6891             return -1;
6892         }
6893     }
6894
6895     return 0;
6896 }
6897
6898 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6899                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6900     MpegEncContext * const s = &h->s;
6901     int i, last = 8, next = 8;
6902     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
6903     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6904         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6905     else
6906     for(i=0;i<size;i++){
6907         if(next)
6908             next = (last + get_se_golomb(&s->gb)) & 0xff;
6909         if(!i && !next){ /* matrix not written, we use the preset one */
6910             memcpy(factors, jvt_list, size*sizeof(uint8_t));
6911             break;
6912         }
6913         last = factors[scan[i]] = next ? next : last;
6914     }
6915 }
6916
6917 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
6918                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
6919     MpegEncContext * const s = &h->s;
6920     int fallback_sps = !is_sps && sps->scaling_matrix_present;
6921     const uint8_t *fallback[4] = {
6922         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
6923         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
6924         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
6925         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
6926     };
6927     if(get_bits1(&s->gb)){
6928         sps->scaling_matrix_present |= is_sps;
6929         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
6930         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
6931         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
6932         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
6933         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
6934         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
6935         if(is_sps || pps->transform_8x8_mode){
6936             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
6937             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
6938         }
6939     }
6940 }
6941
6942 static inline int decode_seq_parameter_set(H264Context *h){
6943     MpegEncContext * const s = &h->s;
6944     int profile_idc, level_idc;
6945     unsigned int sps_id;
6946     int i;
6947     SPS *sps;
6948
6949     profile_idc= get_bits(&s->gb, 8);
6950     get_bits1(&s->gb);   //constraint_set0_flag
6951     get_bits1(&s->gb);   //constraint_set1_flag
6952     get_bits1(&s->gb);   //constraint_set2_flag
6953     get_bits1(&s->gb);   //constraint_set3_flag
6954     get_bits(&s->gb, 4); // reserved
6955     level_idc= get_bits(&s->gb, 8);
6956     sps_id= get_ue_golomb(&s->gb);
6957
6958     if(sps_id >= MAX_SPS_COUNT) {
6959         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
6960         return -1;
6961     }
6962     sps= av_mallocz(sizeof(SPS));
6963     if(sps == NULL)
6964         return -1;
6965
6966     sps->profile_idc= profile_idc;
6967     sps->level_idc= level_idc;
6968
6969     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
6970     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
6971     sps->scaling_matrix_present = 0;
6972
6973     if(sps->profile_idc >= 100){ //high profile
6974         sps->chroma_format_idc= get_ue_golomb(&s->gb);
6975         if(sps->chroma_format_idc == 3)
6976             get_bits1(&s->gb);  //residual_color_transform_flag
6977         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
6978         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
6979         sps->transform_bypass = get_bits1(&s->gb);
6980         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
6981     }else{
6982         sps->chroma_format_idc= 1;
6983     }
6984
6985     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6986     sps->poc_type= get_ue_golomb(&s->gb);
6987
6988     if(sps->poc_type == 0){ //FIXME #define
6989         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6990     } else if(sps->poc_type == 1){//FIXME #define
6991         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6992         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6993         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6994         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
6995
6996         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
6997             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
6998             goto fail;
6999         }
7000
7001         for(i=0; i<sps->poc_cycle_length; i++)
7002             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7003     }else if(sps->poc_type != 2){
7004         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7005         goto fail;
7006     }
7007
7008     sps->ref_frame_count= get_ue_golomb(&s->gb);
7009     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7010         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7011         goto fail;
7012     }
7013     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7014     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7015     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7016     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7017        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7018         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7019         goto fail;
7020     }
7021
7022     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7023     if(!sps->frame_mbs_only_flag)
7024         sps->mb_aff= get_bits1(&s->gb);
7025     else
7026         sps->mb_aff= 0;
7027
7028     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7029
7030 #ifndef ALLOW_INTERLACE
7031     if(sps->mb_aff)
7032         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7033 #endif
7034     sps->crop= get_bits1(&s->gb);
7035     if(sps->crop){
7036         sps->crop_left  = get_ue_golomb(&s->gb);
7037         sps->crop_right = get_ue_golomb(&s->gb);
7038         sps->crop_top   = get_ue_golomb(&s->gb);
7039         sps->crop_bottom= get_ue_golomb(&s->gb);
7040         if(sps->crop_left || sps->crop_top){
7041             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7042         }
7043         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7044             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7045         }
7046     }else{
7047         sps->crop_left  =
7048         sps->crop_right =
7049         sps->crop_top   =
7050         sps->crop_bottom= 0;
7051     }
7052
7053     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7054     if( sps->vui_parameters_present_flag )
7055         decode_vui_parameters(h, sps);
7056
7057     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7058         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7059                sps_id, sps->profile_idc, sps->level_idc,
7060                sps->poc_type,
7061                sps->ref_frame_count,
7062                sps->mb_width, sps->mb_height,
7063                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7064                sps->direct_8x8_inference_flag ? "8B8" : "",
7065                sps->crop_left, sps->crop_right,
7066                sps->crop_top, sps->crop_bottom,
7067                sps->vui_parameters_present_flag ? "VUI" : "",
7068                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7069                );
7070     }
7071     av_free(h->sps_buffers[sps_id]);
7072     h->sps_buffers[sps_id]= sps;
7073     return 0;
7074 fail:
7075     av_free(sps);
7076     return -1;
7077 }
7078
7079 static void
7080 build_qp_table(PPS *pps, int t, int index)
7081 {
7082     int i;
7083     for(i = 0; i < 52; i++)
7084         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7085 }
7086
7087 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7088     MpegEncContext * const s = &h->s;
7089     unsigned int pps_id= get_ue_golomb(&s->gb);
7090     PPS *pps;
7091
7092     if(pps_id >= MAX_PPS_COUNT) {
7093         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7094         return -1;
7095     }
7096
7097     pps= av_mallocz(sizeof(PPS));
7098     if(pps == NULL)
7099         return -1;
7100     pps->sps_id= get_ue_golomb(&s->gb);
7101     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7102         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7103         goto fail;
7104     }
7105
7106     pps->cabac= get_bits1(&s->gb);
7107     pps->pic_order_present= get_bits1(&s->gb);
7108     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7109     if(pps->slice_group_count > 1 ){
7110         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7111         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7112         switch(pps->mb_slice_group_map_type){
7113         case 0:
7114 #if 0
7115 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7116 |    run_length[ i ]                                |1  |ue(v)   |
7117 #endif
7118             break;
7119         case 2:
7120 #if 0
7121 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7122 |{                                                  |   |        |
7123 |    top_left_mb[ i ]                               |1  |ue(v)   |
7124 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7125 |   }                                               |   |        |
7126 #endif
7127             break;
7128         case 3:
7129         case 4:
7130         case 5:
7131 #if 0
7132 |   slice_group_change_direction_flag               |1  |u(1)    |
7133 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7134 #endif
7135             break;
7136         case 6:
7137 #if 0
7138 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7139 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7140 |)                                                  |   |        |
7141 |    slice_group_id[ i ]                            |1  |u(v)    |
7142 #endif
7143             break;
7144         }
7145     }
7146     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7147     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7148     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7149         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7150         goto fail;
7151     }
7152
7153     pps->weighted_pred= get_bits1(&s->gb);
7154     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7155     pps->init_qp= get_se_golomb(&s->gb) + 26;
7156     pps->init_qs= get_se_golomb(&s->gb) + 26;
7157     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7158     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7159     pps->constrained_intra_pred= get_bits1(&s->gb);
7160     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7161
7162     pps->transform_8x8_mode= 0;
7163     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7164     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7165     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7166
7167     if(get_bits_count(&s->gb) < bit_length){
7168         pps->transform_8x8_mode= get_bits1(&s->gb);
7169         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7170         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7171     } else {
7172         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7173     }
7174
7175     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7176     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7177     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7178         h->pps.chroma_qp_diff= 1;
7179
7180     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7181         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7182                pps_id, pps->sps_id,
7183                pps->cabac ? "CABAC" : "CAVLC",
7184                pps->slice_group_count,
7185                pps->ref_count[0], pps->ref_count[1],
7186                pps->weighted_pred ? "weighted" : "",
7187                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7188                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7189                pps->constrained_intra_pred ? "CONSTR" : "",
7190                pps->redundant_pic_cnt_present ? "REDU" : "",
7191                pps->transform_8x8_mode ? "8x8DCT" : ""
7192                );
7193     }
7194
7195     av_free(h->pps_buffers[pps_id]);
7196     h->pps_buffers[pps_id]= pps;
7197     return 0;
7198 fail:
7199     av_free(pps);
7200     return -1;
7201 }
7202
7203 /**
7204  * Call decode_slice() for each context.
7205  *
7206  * @param h h264 master context
7207  * @param context_count number of contexts to execute
7208  */
7209 static void execute_decode_slices(H264Context *h, int context_count){
7210     MpegEncContext * const s = &h->s;
7211     AVCodecContext * const avctx= s->avctx;
7212     H264Context *hx;
7213     int i;
7214
7215     if(context_count == 1) {
7216         decode_slice(avctx, &h);
7217     } else {
7218         for(i = 1; i < context_count; i++) {
7219             hx = h->thread_context[i];
7220             hx->s.error_recognition = avctx->error_recognition;
7221             hx->s.error_count = 0;
7222         }
7223
7224         avctx->execute(avctx, (void *)decode_slice,
7225                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7226
7227         /* pull back stuff from slices to master context */
7228         hx = h->thread_context[context_count - 1];
7229         s->mb_x = hx->s.mb_x;
7230         s->mb_y = hx->s.mb_y;
7231         s->dropable = hx->s.dropable;
7232         s->picture_structure = hx->s.picture_structure;
7233         for(i = 1; i < context_count; i++)
7234             h->s.error_count += h->thread_context[i]->s.error_count;
7235     }
7236 }
7237
7238
7239 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7240     MpegEncContext * const s = &h->s;
7241     AVCodecContext * const avctx= s->avctx;
7242     int buf_index=0;
7243     H264Context *hx; ///< thread context
7244     int context_count = 0;
7245
7246     h->max_contexts = avctx->thread_count;
7247 #if 0
7248     int i;
7249     for(i=0; i<50; i++){
7250         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7251     }
7252 #endif
7253     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7254         h->current_slice = 0;
7255         if (!s->first_field)
7256             s->current_picture_ptr= NULL;
7257     }
7258
7259     for(;;){
7260         int consumed;
7261         int dst_length;
7262         int bit_length;
7263         const uint8_t *ptr;
7264         int i, nalsize = 0;
7265         int err;
7266
7267         if(h->is_avc) {
7268             if(buf_index >= buf_size) break;
7269             nalsize = 0;
7270             for(i = 0; i < h->nal_length_size; i++)
7271                 nalsize = (nalsize << 8) | buf[buf_index++];
7272             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7273                 if(nalsize == 1){
7274                     buf_index++;
7275                     continue;
7276                 }else{
7277                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7278                     break;
7279                 }
7280             }
7281         } else {
7282             // start code prefix search
7283             for(; buf_index + 3 < buf_size; buf_index++){
7284                 // This should always succeed in the first iteration.
7285                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7286                     break;
7287             }
7288
7289             if(buf_index+3 >= buf_size) break;
7290
7291             buf_index+=3;
7292         }
7293
7294         hx = h->thread_context[context_count];
7295
7296         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7297         if (ptr==NULL || dst_length < 0){
7298             return -1;
7299         }
7300         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7301             dst_length--;
7302         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7303
7304         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7305             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7306         }
7307
7308         if (h->is_avc && (nalsize != consumed)){
7309             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7310             consumed= nalsize;
7311         }
7312
7313         buf_index += consumed;
7314
7315         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7316            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7317             continue;
7318
7319       again:
7320         err = 0;
7321         switch(hx->nal_unit_type){
7322         case NAL_IDR_SLICE:
7323             if (h->nal_unit_type != NAL_IDR_SLICE) {
7324                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7325                 return -1;
7326             }
7327             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7328         case NAL_SLICE:
7329             init_get_bits(&hx->s.gb, ptr, bit_length);
7330             hx->intra_gb_ptr=
7331             hx->inter_gb_ptr= &hx->s.gb;
7332             hx->s.data_partitioning = 0;
7333
7334             if((err = decode_slice_header(hx, h)))
7335                break;
7336
7337             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7338             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7339                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7340                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7341                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7342                && avctx->skip_frame < AVDISCARD_ALL)
7343                 context_count++;
7344             break;
7345         case NAL_DPA:
7346             init_get_bits(&hx->s.gb, ptr, bit_length);
7347             hx->intra_gb_ptr=
7348             hx->inter_gb_ptr= NULL;
7349             hx->s.data_partitioning = 1;
7350
7351             err = decode_slice_header(hx, h);
7352             break;
7353         case NAL_DPB:
7354             init_get_bits(&hx->intra_gb, ptr, bit_length);
7355             hx->intra_gb_ptr= &hx->intra_gb;
7356             break;
7357         case NAL_DPC:
7358             init_get_bits(&hx->inter_gb, ptr, bit_length);
7359             hx->inter_gb_ptr= &hx->inter_gb;
7360
7361             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7362                && s->context_initialized
7363                && s->hurry_up < 5
7364                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7365                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7366                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7367                && avctx->skip_frame < AVDISCARD_ALL)
7368                 context_count++;
7369             break;
7370         case NAL_SEI:
7371             init_get_bits(&s->gb, ptr, bit_length);
7372             decode_sei(h);
7373             break;
7374         case NAL_SPS:
7375             init_get_bits(&s->gb, ptr, bit_length);
7376             decode_seq_parameter_set(h);
7377
7378             if(s->flags& CODEC_FLAG_LOW_DELAY)
7379                 s->low_delay=1;
7380
7381             if(avctx->has_b_frames < 2)
7382                 avctx->has_b_frames= !s->low_delay;
7383             break;
7384         case NAL_PPS:
7385             init_get_bits(&s->gb, ptr, bit_length);
7386
7387             decode_picture_parameter_set(h, bit_length);
7388
7389             break;
7390         case NAL_AUD:
7391         case NAL_END_SEQUENCE:
7392         case NAL_END_STREAM:
7393         case NAL_FILLER_DATA:
7394         case NAL_SPS_EXT:
7395         case NAL_AUXILIARY_SLICE:
7396             break;
7397         default:
7398             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7399         }
7400
7401         if(context_count == h->max_contexts) {
7402             execute_decode_slices(h, context_count);
7403             context_count = 0;
7404         }
7405
7406         if (err < 0)
7407             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7408         else if(err == 1) {
7409             /* Slice could not be decoded in parallel mode, copy down
7410              * NAL unit stuff to context 0 and restart. Note that
7411              * rbsp_buffer is not transferred, but since we no longer
7412              * run in parallel mode this should not be an issue. */
7413             h->nal_unit_type = hx->nal_unit_type;
7414             h->nal_ref_idc   = hx->nal_ref_idc;
7415             hx = h;
7416             goto again;
7417         }
7418     }
7419     if(context_count)
7420         execute_decode_slices(h, context_count);
7421     return buf_index;
7422 }
7423
7424 /**
7425  * returns the number of bytes consumed for building the current frame
7426  */
7427 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7428         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7429         if(pos+10>buf_size) pos=buf_size; // oops ;)
7430
7431         return pos;
7432 }
7433
7434 static int decode_frame(AVCodecContext *avctx,
7435                              void *data, int *data_size,
7436                              const uint8_t *buf, int buf_size)
7437 {
7438     H264Context *h = avctx->priv_data;
7439     MpegEncContext *s = &h->s;
7440     AVFrame *pict = data;
7441     int buf_index;
7442
7443     s->flags= avctx->flags;
7444     s->flags2= avctx->flags2;
7445
7446    /* end of stream, output what is still in the buffers */
7447     if (buf_size == 0) {
7448         Picture *out;
7449         int i, out_idx;
7450
7451 //FIXME factorize this with the output code below
7452         out = h->delayed_pic[0];
7453         out_idx = 0;
7454         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7455             if(h->delayed_pic[i]->poc < out->poc){
7456                 out = h->delayed_pic[i];
7457                 out_idx = i;
7458             }
7459
7460         for(i=out_idx; h->delayed_pic[i]; i++)
7461             h->delayed_pic[i] = h->delayed_pic[i+1];
7462
7463         if(out){
7464             *data_size = sizeof(AVFrame);
7465             *pict= *(AVFrame*)out;
7466         }
7467
7468         return 0;
7469     }
7470
7471     if(h->is_avc && !h->got_avcC) {
7472         int i, cnt, nalsize;
7473         unsigned char *p = avctx->extradata;
7474         if(avctx->extradata_size < 7) {
7475             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7476             return -1;
7477         }
7478         if(*p != 1) {
7479             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7480             return -1;
7481         }
7482         /* sps and pps in the avcC always have length coded with 2 bytes,
7483            so put a fake nal_length_size = 2 while parsing them */
7484         h->nal_length_size = 2;
7485         // Decode sps from avcC
7486         cnt = *(p+5) & 0x1f; // Number of sps
7487         p += 6;
7488         for (i = 0; i < cnt; i++) {
7489             nalsize = AV_RB16(p) + 2;
7490             if(decode_nal_units(h, p, nalsize) < 0) {
7491                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7492                 return -1;
7493             }
7494             p += nalsize;
7495         }
7496         // Decode pps from avcC
7497         cnt = *(p++); // Number of pps
7498         for (i = 0; i < cnt; i++) {
7499             nalsize = AV_RB16(p) + 2;
7500             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7501                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7502                 return -1;
7503             }
7504             p += nalsize;
7505         }
7506         // Now store right nal length size, that will be use to parse all other nals
7507         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7508         // Do not reparse avcC
7509         h->got_avcC = 1;
7510     }
7511
7512     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7513         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7514             return -1;
7515         h->got_avcC = 1;
7516     }
7517
7518     buf_index=decode_nal_units(h, buf, buf_size);
7519     if(buf_index < 0)
7520         return -1;
7521
7522     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7523         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7524         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7525         return -1;
7526     }
7527
7528     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7529         Picture *out = s->current_picture_ptr;
7530         Picture *cur = s->current_picture_ptr;
7531         int i, pics, cross_idr, out_of_order, out_idx;
7532
7533         s->mb_y= 0;
7534
7535         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7536         s->current_picture_ptr->pict_type= s->pict_type;
7537
7538         if(!s->dropable) {
7539             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7540             h->prev_poc_msb= h->poc_msb;
7541             h->prev_poc_lsb= h->poc_lsb;
7542         }
7543         h->prev_frame_num_offset= h->frame_num_offset;
7544         h->prev_frame_num= h->frame_num;
7545
7546         /*
7547          * FIXME: Error handling code does not seem to support interlaced
7548          * when slices span multiple rows
7549          * The ff_er_add_slice calls don't work right for bottom
7550          * fields; they cause massive erroneous error concealing
7551          * Error marking covers both fields (top and bottom).
7552          * This causes a mismatched s->error_count
7553          * and a bad error table. Further, the error count goes to
7554          * INT_MAX when called for bottom field, because mb_y is
7555          * past end by one (callers fault) and resync_mb_y != 0
7556          * causes problems for the first MB line, too.
7557          */
7558         if (!FIELD_PICTURE)
7559             ff_er_frame_end(s);
7560
7561         MPV_frame_end(s);
7562
7563         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7564             /* Wait for second field. */
7565             *data_size = 0;
7566
7567         } else {
7568             cur->repeat_pict = 0;
7569
7570             /* Signal interlacing information externally. */
7571             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7572             if(h->sps.pic_struct_present_flag){
7573                 switch (h->sei_pic_struct)
7574                 {
7575                 case SEI_PIC_STRUCT_FRAME:
7576                     cur->interlaced_frame = 0;
7577                     break;
7578                 case SEI_PIC_STRUCT_TOP_FIELD:
7579                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7580                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7581                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7582                     cur->interlaced_frame = 1;
7583                     break;
7584                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7585                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7586                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7587                     // From these hints, let the applications decide if they apply deinterlacing.
7588                     cur->repeat_pict = 1;
7589                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7590                     break;
7591                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7592                     // Force progressive here, as doubling interlaced frame is a bad idea.
7593                     cur->interlaced_frame = 0;
7594                     cur->repeat_pict = 2;
7595                     break;
7596                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7597                     cur->interlaced_frame = 0;
7598                     cur->repeat_pict = 4;
7599                     break;
7600                 }
7601             }else{
7602                 /* Derive interlacing flag from used decoding process. */
7603                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7604             }
7605
7606             if (cur->field_poc[0] != cur->field_poc[1]){
7607                 /* Derive top_field_first from field pocs. */
7608                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7609             }else{
7610                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7611                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7612                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7613                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7614                         cur->top_field_first = 1;
7615                     else
7616                         cur->top_field_first = 0;
7617                 }else{
7618                     /* Most likely progressive */
7619                     cur->top_field_first = 0;
7620                 }
7621             }
7622
7623         //FIXME do something with unavailable reference frames
7624
7625             /* Sort B-frames into display order */
7626
7627             if(h->sps.bitstream_restriction_flag
7628                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7629                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7630                 s->low_delay = 0;
7631             }
7632
7633             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7634                && !h->sps.bitstream_restriction_flag){
7635                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7636                 s->low_delay= 0;
7637             }
7638
7639             pics = 0;
7640             while(h->delayed_pic[pics]) pics++;
7641
7642             assert(pics <= MAX_DELAYED_PIC_COUNT);
7643
7644             h->delayed_pic[pics++] = cur;
7645             if(cur->reference == 0)
7646                 cur->reference = DELAYED_PIC_REF;
7647
7648             out = h->delayed_pic[0];
7649             out_idx = 0;
7650             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7651                 if(h->delayed_pic[i]->poc < out->poc){
7652                     out = h->delayed_pic[i];
7653                     out_idx = i;
7654                 }
7655             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7656
7657             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7658
7659             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7660                 { }
7661             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7662                || (s->low_delay &&
7663                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7664                  || cur->pict_type == FF_B_TYPE)))
7665             {
7666                 s->low_delay = 0;
7667                 s->avctx->has_b_frames++;
7668             }
7669
7670             if(out_of_order || pics > s->avctx->has_b_frames){
7671                 out->reference &= ~DELAYED_PIC_REF;
7672                 for(i=out_idx; h->delayed_pic[i]; i++)
7673                     h->delayed_pic[i] = h->delayed_pic[i+1];
7674             }
7675             if(!out_of_order && pics > s->avctx->has_b_frames){
7676                 *data_size = sizeof(AVFrame);
7677
7678                 h->outputed_poc = out->poc;
7679                 *pict= *(AVFrame*)out;
7680             }else{
7681                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7682             }
7683         }
7684     }
7685
7686     assert(pict->data[0] || !*data_size);
7687     ff_print_debug_info(s, pict);
7688 //printf("out %d\n", (int)pict->data[0]);
7689 #if 0 //?
7690
7691     /* Return the Picture timestamp as the frame number */
7692     /* we subtract 1 because it is added on utils.c     */
7693     avctx->frame_number = s->picture_number - 1;
7694 #endif
7695     return get_consumed_bytes(s, buf_index, buf_size);
7696 }
#if 0
/**
 * Fill h->mb_avail[] for the current macroblock (disabled code).
 *
 * Entries 0..2 flag the top-left, top and top-right neighbours, entry 3 the
 * left neighbour; a neighbour counts only if its slice_table entry equals
 * h->slice_num. Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7716
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a buffer,
 * reads them back and prints a message for every value that does not round
 * trip. The 4x4 (I)DCT, quantizer and NAL layer tests are disabled (#if 0).
 *
 * @return 0
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the raw bits so a mismatch can be reported with them */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, i - COUNT/2 */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
7891
7892
7893 static av_cold int decode_end(AVCodecContext *avctx)
7894 {
7895     H264Context *h = avctx->priv_data;
7896     MpegEncContext *s = &h->s;
7897     int i;
7898
7899     av_freep(&h->rbsp_buffer[0]);
7900     av_freep(&h->rbsp_buffer[1]);
7901     free_tables(h); //FIXME cleanup init stuff perhaps
7902
7903     for(i = 0; i < MAX_SPS_COUNT; i++)
7904         av_freep(h->sps_buffers + i);
7905
7906     for(i = 0; i < MAX_PPS_COUNT; i++)
7907         av_freep(h->pps_buffers + i);
7908
7909     MPV_common_end(s);
7910
7911 //    memset(h, 0, sizeof(H264Context));
7912
7913     return 0;
7914 }
7915
7916
7917 AVCodec h264_decoder = {
7918     "h264",
7919     CODEC_TYPE_VIDEO,
7920     CODEC_ID_H264,
7921     sizeof(H264Context),
7922     decode_init,
7923     NULL,
7924     decode_end,
7925     decode_frame,
7926     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
7927     .flush= flush_dpb,
7928     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
7929 };
7930
7931 #include "svq3.c"