libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file libavcodec/h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "mathops.h"
  36 #include "rectangle.h"
  37 #include "vdpau_internal.h"
  38
  39 #include "cabac.h"
  40 #if ARCH_X86
  41 #include "x86/h264_i386.h"
  42 #endif
  43
  44 //#undef NDEBUG
  45 #include <assert.h>
  46
/**
 * Value of Picture.reference when Picture is not a reference picture, but
 * is held for delayed output.
 */
#define DELAYED_PIC_REF 4

/*
 * File-scope VLC state. Each VLC object (or array of VLC objects) below is
 * accompanied by a statically sized backing table and by a constant holding
 * the number of entries of that table.
 * NOTE(review): presumably these are filled in once during decoder
 * initialization, which is outside this part of the file - confirm.
 */

/* One backing area for the four coeff_token VLCs; its length is the sum of
 * the four per-table sizes listed in coeff_token_vlc_tables_size[]. */
static VLC coeff_token_vlc[4];
static VLC_TYPE coeff_token_vlc_tables[520+332+280+256][2];
static const int coeff_token_vlc_tables_size[4]={520,332,280,256};

static VLC chroma_dc_coeff_token_vlc;
static VLC_TYPE chroma_dc_coeff_token_vlc_table[256][2];
static const int chroma_dc_coeff_token_vlc_table_size = 256;

/* 15 VLCs, each with its own 512-entry backing table. */
static VLC total_zeros_vlc[15];
static VLC_TYPE total_zeros_vlc_tables[15][512][2];
static const int total_zeros_vlc_tables_size = 512;

/* 3 VLCs, each with its own 8-entry backing table. */
static VLC chroma_dc_total_zeros_vlc[3];
static VLC_TYPE chroma_dc_total_zeros_vlc_tables[3][8][2];
static const int chroma_dc_total_zeros_vlc_tables_size = 8;

/* 6 VLCs, each with its own 8-entry backing table. */
static VLC run_vlc[6];
static VLC_TYPE run_vlc_tables[6][8][2];
static const int run_vlc_tables_size = 8;

static VLC run7_vlc;
static VLC_TYPE run7_vlc_table[96][2];
static const int run7_vlc_table_size = 96;

/* Forward declarations of static functions that are referenced in this file
 * before (or without) a visible definition. */
static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
static Picture * remove_long(H264Context *h, int i, int ref_mask);
  82
  83 static av_always_inline uint32_t pack16to32(int a, int b){
  84 #ifdef WORDS_BIGENDIAN
  85    return (b&0xFFFF) + (a<<16);
  86 #else
  87    return (a&0xFFFF) + (b<<16);
  88 #endif
  89 }
  90
/* rem6[i] == i % 6 for i in 0..51 (remainder lookup, avoids a division).
 * NOTE(review): presumably indexed by a quantizer value - confirm. */
static const uint8_t rem6[52]={
0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
};

/* div6[i] == i / 6 for i in 0..51 (quotient lookup, companion of rem6). */
static const uint8_t div6[52]={
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
};

/*
 * Index sets used by fill_caches() to pick values out of the left neighbor
 * macroblock(s). fill_caches() uses row 0 by default; when FRAME_MBAFF is set
 * and the field flags of the current and left macroblock pairs differ, it
 * uses row 3 for a field macroblock and row 2 or 1 (top / bottom macroblock
 * of the pair) for a frame macroblock.
 * Entries 0..3 are used there as indices/row offsets for the intra4x4
 * prediction modes, non-zero counts and motion data of the left neighbor;
 * entries 4..7 index further non_zero_count[] entries of the left neighbor.
 * NOTE(review): entries 4..7 presumably address the chroma counts - confirm.
 */
static const int left_block_options[4][8]={
    {0,1,2,3,7,10,8,11},
    {2,2,3,3,8,11,8,11},
    {0,0,1,1,7,10,7,10},
    {0,2,0,2,7,10,7,10}
};

/* Lookup table of 7 x (1<<LEVEL_TAB_BITS) entries holding two int8_t values
 * each; it is not referenced in this part of the file.
 * NOTE(review): presumably a CAVLC level decoding shortcut - confirm. */
#define LEVEL_TAB_BITS 8
static int8_t cavlc_level_tab[7][1<<LEVEL_TAB_BITS][2];
 108
/**
 * Fills the per-macroblock neighbor caches of the current macroblock
 * (h->mb_xy) from the picture-wide tables.
 *
 * The function first determines the positions of the top, top-left,
 * top-right and left neighbor macroblocks (with additional adjustments when
 * FRAME_MBAFF is set) and stores the top/left positions in h->top_mb_xy and
 * h->left_mb_xy[]. Depending on mb_type and for_deblock it then fills:
 * the intra sample availability masks and the intra4x4 prediction mode
 * cache, non_zero_count_cache, top_cbp/left_cbp (CABAC only), the motion
 * vector and reference caches, the mvd and direct caches (CABAC only), and
 * finally neighbor_transform_size.
 *
 * @param h           decoder context, read and modified in place
 * @param mb_type     type flags of the current macroblock
 * @param for_deblock if non-zero: neighbors are accepted regardless of which
 *                    slice they belong to, the intra availability part and
 *                    the top-left/top-right motion data are skipped, and the
 *                    function returns early without touching anything when
 *                    the caches cannot differ (see the check below)
 */
static void fill_caches(H264Context *h, int mb_type, int for_deblock){
    MpegEncContext * const s = &h->s;
    const int mb_xy= h->mb_xy;
    int topleft_xy, top_xy, topright_xy, left_xy[2];
    int topleft_type, top_type, topright_type, left_type[2];
    const int * left_block;
    int topleft_partition= -1;
    int i;

    // one macroblock row up; two rows up when FIELD_PICTURE is 1
    top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);

    //FIXME deblocking could skip the intra and nnz parts.
    // Nothing to do for the deblocking call if this is the first slice or the
    // top neighbor is in the same slice as the current macroblock (non-MBAFF only).
    if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
        return;

    /* Wow, what a mess, why didn't they simplify the interlacing & intra
     * stuff, I can't imagine that these complex rules are worth it. */

    // default (non-MBAFF) neighbor positions
    topleft_xy = top_xy - 1;
    topright_xy= top_xy + 1;
    left_xy[1] = left_xy[0] = mb_xy-1;
    left_block = left_block_options[0];
    if(FRAME_MBAFF){
        // positions of the macroblock pairs (pair_xy is the top macroblock of the current pair)
        const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
        const int top_pair_xy      = pair_xy     - s->mb_stride;
        const int topleft_pair_xy  = top_pair_xy - 1;
        const int topright_pair_xy = top_pair_xy + 1;
        const int topleft_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
        const int top_mb_field_flag      = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
        const int topright_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
        const int left_mb_field_flag     = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
        const int curr_mb_field_flag     = IS_INTERLACED(mb_type);
        const int bottom = (s->mb_y & 1); // 1 for the bottom macroblock of the pair
        tprintf(s->avctx, "fill_caches: curr_mb_field_flag:%d, left_mb_field_flag:%d, topleft_mb_field_flag:%d, top_mb_field_flag:%d, topright_mb_field_flag:%d\n", curr_mb_field_flag, left_mb_field_flag, topleft_mb_field_flag, top_mb_field_flag, topright_mb_field_flag);

        // a field macroblock takes its upper neighbors one row further up when it
        // is the bottom one of its pair or when the neighboring pair is a field pair
        if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
            top_xy -= s->mb_stride;
        }
        if (curr_mb_field_flag && (bottom || topleft_mb_field_flag)){
            topleft_xy -= s->mb_stride;
        } else if(bottom && !curr_mb_field_flag && left_mb_field_flag) {
            topleft_xy += s->mb_stride;
            // take top left mv from the middle of the mb, as opposed to all other modes which use the bottom right partition
            topleft_partition = 0;
        }
        if (curr_mb_field_flag && (bottom || topright_mb_field_flag)){
            topright_xy -= s->mb_stride;
        }
        // frame/field mismatch with the left pair: the left neighbors are taken
        // relative to the top of the left pair with a different block selection
        if (left_mb_field_flag != curr_mb_field_flag) {
            left_xy[1] = left_xy[0] = pair_xy - 1;
            if (curr_mb_field_flag) {
                left_xy[1] += s->mb_stride;
                left_block = left_block_options[3];
            } else {
                left_block= left_block_options[2 - bottom];
            }
        }
    }

    h->top_mb_xy = top_xy;
    h->left_mb_xy[0] = left_xy[0];
    h->left_mb_xy[1] = left_xy[1];
    if(for_deblock){
        // Deblocking: top-left/top-right are not used; top/left count whenever
        // their slice_table entry is below 0xFFFF, whatever slice they are in.
        // NOTE(review): 0xFFFF presumably marks macroblocks not decoded yet - confirm.
        topleft_type = 0;
        topright_type = 0;
        top_type     = h->slice_table[top_xy     ] < 0xFFFF ? s->current_picture.mb_type[top_xy]     : 0;
        left_type[0] = h->slice_table[left_xy[0] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] < 0xFFFF ? s->current_picture.mb_type[left_xy[1]] : 0;

        if(MB_MBAFF && !IS_INTRA(mb_type)){
            int list;
            for(list=0; list<h->list_count; list++){
                //These values were changed for ease of performing MC, we need to change them back
                //FIXME maybe we can make MC and loop filter use the same values or prevent
                //the MC code from changing ref_cache and rather use a temporary array.
                if(USES_LIST(mb_type,list)){
                    // reload the reference indices of the current macroblock from
                    // ref_index; each index is replicated over a 2x2 group of cache entries
                    int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
                    *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
                    ref += h->b8_stride;
                    *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
                    *(uint32_t*)&h->ref_cache[list][scan8[10]] = (pack16to32(ref[0],ref[1])&0x00FF00FF)*0x0101;
                }
            }
        }
    }else{
        // Decoding: a neighbor only counts if it belongs to the current slice;
        // a type of 0 means "not available".
        topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
        top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
        topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
        left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
        left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;

    if(IS_INTRA(mb_type)){
        // With constrained_intra_pred only neighbors matching the IS_INTRA(-1) mask
        // count as available for intra prediction, otherwise any non-zero type does.
        int type_mask= h->pps.constrained_intra_pred ? IS_INTRA(-1) : -1;
        // start with the default masks, then clear bits for missing neighbors
        h->topleft_samples_available=
        h->top_samples_available=
        h->left_samples_available= 0xFFFF;
        h->topright_samples_available= 0xEEEA;

        if(!(top_type & type_mask)){
            h->topleft_samples_available= 0xB3FF;
            h->top_samples_available= 0x33FF;
            h->topright_samples_available= 0x26EA;
        }
        if(IS_INTERLACED(mb_type) != IS_INTERLACED(left_type[0])){
            if(IS_INTERLACED(mb_type)){
                // field macroblock next to a frame left neighbor: the two left
                // macroblocks are checked separately
                if(!(left_type[0] & type_mask)){
                    h->topleft_samples_available&= 0xDFFF;
                    h->left_samples_available&= 0x5FFF;
                }
                if(!(left_type[1] & type_mask)){
                    h->topleft_samples_available&= 0xFF5F;
                    h->left_samples_available&= 0xFF5F;
                }
            }else{
                // frame macroblock next to a field left neighbor: both macroblocks
                // of the left pair have to be usable
                int left_typei = h->slice_table[left_xy[0] + s->mb_stride ] == h->slice_num
                                ? s->current_picture.mb_type[left_xy[0] + s->mb_stride] : 0;
                assert(left_xy[0] == left_xy[1]);
                if(!((left_typei & type_mask) && (left_type[0] & type_mask))){
                    h->topleft_samples_available&= 0xDF5F;
                    h->left_samples_available&= 0x5F5F;
                }
            }
        }else{
            if(!(left_type[0] & type_mask)){
                h->topleft_samples_available&= 0xDF5F;
                h->left_samples_available&= 0x5F5F;
            }
        }

        if(!(topleft_type & type_mask))
            h->topleft_samples_available&= 0x7FFF;

        if(!(topright_type & type_mask))
            h->topright_samples_available&= 0xFBFF;

        if(IS_INTRA4x4(mb_type)){
            // Load the prediction modes bordering the current macroblock. For a
            // neighbor that is not intra4x4 the value is -1 if it is unusable
            // for intra prediction and 2 otherwise.
            if(IS_INTRA4x4(top_type)){
                h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
                h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
                h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
                h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
            }else{
                int pred;
                if(!(top_type & type_mask))
                    pred= -1;
                else{
                    pred= 2;
                }
                h->intra4x4_pred_mode_cache[4+8*0]=
                h->intra4x4_pred_mode_cache[5+8*0]=
                h->intra4x4_pred_mode_cache[6+8*0]=
                h->intra4x4_pred_mode_cache[7+8*0]= pred;
            }
            for(i=0; i<2; i++){
                if(IS_INTRA4x4(left_type[i])){
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
                }else{
                    int pred;
                    if(!(left_type[i] & type_mask))
                        pred= -1;
                    else{
                        pred= 2;
                    }
                    h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
                    h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
                }
            }
        }
    }
    }


/*
0 . T T. T T T T
1 L . .L . . . .
2 L . .L . . . .
3 . T TL . . . .
4 L . .L . . . .
5 L . .. . . . .
*/
//FIXME constraint_intra_pred & partitioning & nnz (let us hope this is just a typo in the spec)
    // Non-zero counts of the top neighbor; if it is unavailable the entries are
    // set to 0 (CABAC and current macroblock not intra) or to 64 otherwise.
    if(top_type){
        h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
        h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
        h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
        h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];

        h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
        h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];

        h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
        h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];

    }else{
        h->non_zero_count_cache[4+8*0]=
        h->non_zero_count_cache[5+8*0]=
        h->non_zero_count_cache[6+8*0]=
        h->non_zero_count_cache[7+8*0]=

        h->non_zero_count_cache[1+8*0]=
        h->non_zero_count_cache[2+8*0]=

        h->non_zero_count_cache[1+8*3]=
        h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;

    }

    // Same for the two left neighbors, using the entries selected by left_block.
    for (i=0; i<2; i++) {
        if(left_type[i]){
            h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
            h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
            h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
            h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
        }else{
            h->non_zero_count_cache[3+8*1 + 2*8*i]=
            h->non_zero_count_cache[3+8*2 + 2*8*i]=
            h->non_zero_count_cache[0+8*1 +   8*i]=
            h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
        }
    }

    if( h->pps.cabac ) {
        // Coded block patterns of the neighbors. An unavailable neighbor yields
        // 0x1C0 when the current macroblock is intra and 0 otherwise.
        // top_cbp
        if(top_type) {
            h->top_cbp = h->cbp_table[top_xy];
        } else if(IS_INTRA(mb_type)) {
            h->top_cbp = 0x1C0;
        } else {
            h->top_cbp = 0;
        }
        // left_cbp
        if (left_type[0]) {
            h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
        } else if(IS_INTRA(mb_type)) {
            h->left_cbp = 0x1C0;
        } else {
            h->left_cbp = 0;
        }
        // bits 1 and 3 of left_cbp are taken from one cbp bit of each left
        // neighbor, the bit being selected through left_block
        if (left_type[0]) {
            h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
        }
        if (left_type[1]) {
            h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
        }
    }

#if 1
    // Motion vectors, reference indices and (CABAC only) mvd/direct flags of
    // the neighbors, per reference list.
    if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
        int list;
        for(list=0; list<h->list_count; list++){
            // lists not used by this macroblock are skipped unless the
            // macroblock is direct or the deblocking filter is enabled
            if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
                /*if(!h->mv_cache_clean[list]){
                    memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
                    memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
                    h->mv_cache_clean[list]= 1;
                }*/
                continue;
            }
            h->mv_cache_clean[list]= 0;

            // top neighbor: its bottom row of motion vectors (row 3 of its 4x4
            // blocks) and the two reference indices of its lower 8x8 blocks
            if(USES_LIST(top_type, list)){
                const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
                *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
                h->ref_cache[list][scan8[0] + 0 - 1*8]=
                h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
                h->ref_cache[list][scan8[0] + 2 - 1*8]=
                h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
            }else{
                // zero vectors; the reference is LIST_NOT_USED if the neighbor exists
                // but does not use this list, PART_NOT_AVAILABLE if it does not exist
                *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
                *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
                *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
            }

            // left neighbors: rightmost column of each, two cache rows per neighbor
            for(i=0; i<2; i++){
                int cache_idx = scan8[0] - 1 + i*2*8;
                if(USES_LIST(left_type[i], list)){
                    const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
                    const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
                    *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
                    *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
                    h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
                    h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
                }else{
                    *(uint32_t*)h->mv_cache [list][cache_idx  ]=
                    *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
                    h->ref_cache[list][cache_idx  ]=
                    h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
                }
            }

            // the remaining entries are not loaded for deblocking, nor for
            // non-MBAFF direct macroblocks without spatial direct prediction
            if(for_deblock || ((IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred) && !FRAME_MBAFF))
                continue;

            // top-left neighbor: a single vector/reference; topleft_partition
            // (-1 or 0) selects, through the bit masks below, which row is read
            if(USES_LIST(topleft_type, list)){
                const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
                const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
                h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            // top-right neighbor: its bottom-left vector/reference
            if(USES_LIST(topright_type, list)){
                const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
                const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
                *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
                h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
            }else{
                *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
                h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
            }

            if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
                continue;

            // mark a few fixed cache positions as not available and clear their vectors
            h->ref_cache[list][scan8[5 ]+1] =
            h->ref_cache[list][scan8[7 ]+1] =
            h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
            h->ref_cache[list][scan8[4 ]] =
            h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
            *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
            *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
            *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
            *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;

            if( h->pps.cabac ) {
                /* XXX beurk, Load mvd */
                // same layout as the mv cache above, but read from mvd_table
                if(USES_LIST(top_type, list)){
                    const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
                }
                if(USES_LIST(left_type[0], list)){
                    const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
                }
                if(USES_LIST(left_type[1], list)){
                    const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
                    *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
                }else{
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
                    *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
                }
                *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
                *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
                *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
                *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;

                if(h->slice_type_nos == FF_B_TYPE){
                    // direct flags: clear the 4x4 area of the current macroblock,
                    // then load the flags of the top and left neighbors
                    fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);

                    if(IS_DIRECT(top_type)){
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
                    }else if(IS_8X8(top_type)){
                        int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
                        h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
                        h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
                    }else{
                        *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
                    }

                    if(IS_DIRECT(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= 1;
                    else if(IS_8X8(left_type[0]))
                        h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 0*8]= 0;

                    if(IS_DIRECT(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= 1;
                    else if(IS_8X8(left_type[1]))
                        h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
                    else
                        h->direct_cache[scan8[0] - 1 + 2*8]= 0;
                }
            }

            // MBAFF: rescale the cached neighbor data whose frame/field type differs
            // from the current macroblock. MAP_MVS applies MAP_F2F to every loaded
            // neighbor entry; MAP_F2F is defined differently for the two directions.
            if(FRAME_MBAFF){
#define MAP_MVS\
                    MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
                    MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
                    MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
                    MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
                    MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
                    MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
                if(MB_FIELD){
                    // current is field, neighbor is frame: double the reference
                    // index and halve the vertical vector components
#define MAP_F2F(idx, mb_type)\
                    if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] <<= 1;\
                        h->mv_cache[list][idx][1] /= 2;\
                        h->mvd_cache[list][idx][1] /= 2;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }else{
                    // current is frame, neighbor is field: halve the reference
                    // index and double the vertical vector components
#define MAP_F2F(idx, mb_type)\
                    if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
                        h->ref_cache[list][idx] >>= 1;\
                        h->mv_cache[list][idx][1] <<= 1;\
                        h->mvd_cache[list][idx][1] <<= 1;\
                    }
                    MAP_MVS
#undef MAP_F2F
                }
            }
        }
    }
#endif

    // number (0..2) of the top and first left neighbors that use the 8x8 transform
    h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
}
 547
 548 static inline void write_back_intra_pred_mode(H264Context *h){
 549     const int mb_xy= h->mb_xy;
 550
 551     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 552     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 553     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 554     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 555     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 556     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 557     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 558 }
 559
 560 /**
 561  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 562  */
 563 static inline int check_intra4x4_pred_mode(H264Context *h){
 564     MpegEncContext * const s = &h->s;
 565     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 566     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 567     int i;
 568
 569     if(!(h->top_samples_available&0x8000)){
 570         for(i=0; i<4; i++){
 571             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 572             if(status<0){
 573                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 574                 return -1;
 575             } else if(status){
 576                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 577             }
 578         }
 579     }
 580
 581     if((h->left_samples_available&0x8888)!=0x8888){
 582         static const int mask[4]={0x8000,0x2000,0x80,0x20};
 583         for(i=0; i<4; i++){
 584             if(!(h->left_samples_available&mask[i])){
 585                 int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 586                 if(status<0){
 587                     av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 588                     return -1;
 589                 } else if(status){
 590                     h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 591                 }
 592             }
 593         }
 594     }
 595
 596     return 0;
 597 } //FIXME cleanup like next
 598
 599 /**
 600  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 601  */
 602 static inline int check_intra_pred_mode(H264Context *h, int mode){
 603     MpegEncContext * const s = &h->s;
 604     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 605     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 606
 607     if(mode > 6U) {
 608         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 609         return -1;
 610     }
 611
 612     if(!(h->top_samples_available&0x8000)){
 613         mode= top[ mode ];
 614         if(mode<0){
 615             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 616             return -1;
 617         }
 618     }
 619
 620     if((h->left_samples_available&0x8080) != 0x8080){
 621         mode= left[ mode ];
 622         if(h->left_samples_available&0x8080){ //mad cow disease mode, aka MBAFF + constrained_intra_pred
 623             mode= ALZHEIMER_DC_L0T_PRED8x8 + (!(h->left_samples_available&0x8000)) + 2*(mode == DC_128_PRED8x8);
 624         }
 625         if(mode<0){
 626             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 627             return -1;
 628         }
 629     }
 630
 631     return mode;
 632 }
 633
 634 /**
 635  * gets the predicted intra4x4 prediction mode.
 636  */
 637 static inline int pred_intra_mode(H264Context *h, int n){
 638     const int index8= scan8[n];
 639     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 640     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 641     const int min= FFMIN(left, top);
 642
 643     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 644
 645     if(min<0) return DC_PRED;
 646     else      return min;
 647 }
 648
 649 static inline void write_back_non_zero_count(H264Context *h){
 650     const int mb_xy= h->mb_xy;
 651
 652     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 653     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 654     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 655     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 656     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 657     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 658     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 659
 660     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 661     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 662     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 663
 664     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 665     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 666     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 667 }
 668
 669 /**
 670  * gets the predicted number of non-zero coefficients.
 671  * @param n block index
 672  */
 673 static inline int pred_non_zero_count(H264Context *h, int n){
 674     const int index8= scan8[n];
 675     const int left= h->non_zero_count_cache[index8 - 1];
 676     const int top = h->non_zero_count_cache[index8 - 8];
 677     int i= left + top;
 678
 679     if(i<64) i= (i+1)>>1;
 680
 681     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 682
 683     return i&31;
 684 }
 685
 686 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 687     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 688     MpegEncContext *s = &h->s;
 689
 690     /* there is no consistent mapping of mvs to neighboring locations that will
 691      * make mbaff happy, so we can't move all this logic to fill_caches */
 692     if(FRAME_MBAFF){
 693         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 694         const int16_t *mv;
 695         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 696         *C = h->mv_cache[list][scan8[0]-2];
 697
 698         if(!MB_FIELD
 699            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 700             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 701             if(IS_INTERLACED(mb_types[topright_xy])){
 702 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 703                 const int x4 = X4, y4 = Y4;\
 704                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 705                 if(!USES_LIST(mb_type,list))\
 706                     return LIST_NOT_USED;\
 707                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 708                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 709                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 710                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 711
 712                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 713             }
 714         }
 715         if(topright_ref == PART_NOT_AVAILABLE
 716            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 717            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 718             if(!MB_FIELD
 719                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 720                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 721             }
 722             if(MB_FIELD
 723                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 724                && i >= scan8[0]+8){
 725                 // left shift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's OK.
 726                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 727             }
 728         }
 729 #undef SET_DIAG_MV
 730     }
 731
 732     if(topright_ref != PART_NOT_AVAILABLE){
 733         *C= h->mv_cache[list][ i - 8 + part_width ];
 734         return topright_ref;
 735     }else{
 736         tprintf(s->avctx, "topright MV not available\n");
 737
 738         *C= h->mv_cache[list][ i - 8 - 1 ];
 739         return h->ref_cache[list][ i - 8 - 1 ];
 740     }
 741 }
 742
 743 /**
 744  * gets the predicted MV.
 745  * @param n the block index
 746  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 747  * @param mx the x component of the predicted motion vector
 748  * @param my the y component of the predicted motion vector
 749  */
 750 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 751     const int index8= scan8[n];
 752     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 753     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 754     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 755     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 756     const int16_t * C;
 757     int diagonal_ref, match_count;
 758
 759     assert(part_width==1 || part_width==2 || part_width==4);
 760
 761 /* mv_cache
 762   B . . A T T T T
 763   U . . L . . , .
 764   U . . L . . . .
 765   U . . L . . , .
 766   . . . L . . . .
 767 */
 768
 769     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 770     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 771     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 772     if(match_count > 1){ //most common
 773         *mx= mid_pred(A[0], B[0], C[0]);
 774         *my= mid_pred(A[1], B[1], C[1]);
 775     }else if(match_count==1){
 776         if(left_ref==ref){
 777             *mx= A[0];
 778             *my= A[1];
 779         }else if(top_ref==ref){
 780             *mx= B[0];
 781             *my= B[1];
 782         }else{
 783             *mx= C[0];
 784             *my= C[1];
 785         }
 786     }else{
 787         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 788             *mx= A[0];
 789             *my= A[1];
 790         }else{
 791             *mx= mid_pred(A[0], B[0], C[0]);
 792             *my= mid_pred(A[1], B[1], C[1]);
 793         }
 794     }
 795
 796     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 797 }
 798
 799 /**
 800  * gets the directionally predicted 16x8 MV.
 801  * @param n the block index
 802  * @param mx the x component of the predicted motion vector
 803  * @param my the y component of the predicted motion vector
 804  */
 805 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 806     if(n==0){
 807         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 808         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 809
 810         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 811
 812         if(top_ref == ref){
 813             *mx= B[0];
 814             *my= B[1];
 815             return;
 816         }
 817     }else{
 818         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 819         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 820
 821         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 822
 823         if(left_ref == ref){
 824             *mx= A[0];
 825             *my= A[1];
 826             return;
 827         }
 828     }
 829
 830     //RARE
 831     pred_motion(h, n, 4, list, ref, mx, my);
 832 }
 833
 834 /**
 835  * gets the directionally predicted 8x16 MV.
 836  * @param n the block index
 837  * @param mx the x component of the predicted motion vector
 838  * @param my the y component of the predicted motion vector
 839  */
 840 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 841     if(n==0){
 842         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 843         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 844
 845         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 846
 847         if(left_ref == ref){
 848             *mx= A[0];
 849             *my= A[1];
 850             return;
 851         }
 852     }else{
 853         const int16_t * C;
 854         int diagonal_ref;
 855
 856         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 857
 858         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 859
 860         if(diagonal_ref == ref){
 861             *mx= C[0];
 862             *my= C[1];
 863             return;
 864         }
 865     }
 866
 867     //RARE
 868     pred_motion(h, n, 2, list, ref, mx, my);
 869 }
 870
 871 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 872     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 873     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 874
 875     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 876
 877     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 878        || !( top_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ])
 879        || !(left_ref | *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ])){
 880
 881         *mx = *my = 0;
 882         return;
 883     }
 884
 885     pred_motion(h, 0, 4, 0, 0, mx, my);
 886
 887     return;
 888 }
 889
 890 static int get_scale_factor(H264Context * const h, int poc, int poc1, int i){
 891     int poc0 = h->ref_list[0][i].poc;
 892     int td = av_clip(poc1 - poc0, -128, 127);
 893     if(td == 0 || h->ref_list[0][i].long_ref){
 894         return 256;
 895     }else{
 896         int tb = av_clip(poc - poc0, -128, 127);
 897         int tx = (16384 + (FFABS(td) >> 1)) / td;
 898         return av_clip((tb*tx + 32) >> 6, -1024, 1023);
 899     }
 900 }
 901
 902 static inline void direct_dist_scale_factor(H264Context * const h){
 903     MpegEncContext * const s = &h->s;
 904     const int poc = h->s.current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
 905     const int poc1 = h->ref_list[1][0].poc;
 906     int i, field;
 907     for(field=0; field<2; field++){
 908         const int poc  = h->s.current_picture_ptr->field_poc[field];
 909         const int poc1 = h->ref_list[1][0].field_poc[field];
 910         for(i=0; i < 2*h->ref_count[0]; i++)
 911             h->dist_scale_factor_field[field][i^field] = get_scale_factor(h, poc, poc1, i+16);
 912     }
 913
 914     for(i=0; i<h->ref_count[0]; i++){
 915         h->dist_scale_factor[i] = get_scale_factor(h, poc, poc1, i);
 916     }
 917 }
 918
 919 static void fill_colmap(H264Context *h, int map[2][16+32], int list, int field, int colfield, int mbafi){
 920     MpegEncContext * const s = &h->s;
 921     Picture * const ref1 = &h->ref_list[1][0];
 922     int j, old_ref, rfield;
 923     int start= mbafi ? 16                      : 0;
 924     int end  = mbafi ? 16+2*h->ref_count[list] : h->ref_count[list];
 925     int interl= mbafi || s->picture_structure != PICT_FRAME;
 926
 927     /* bogus; fills in for missing frames */
 928     memset(map[list], 0, sizeof(map[list]));
 929
 930     for(rfield=0; rfield<2; rfield++){
 931         for(old_ref=0; old_ref<ref1->ref_count[colfield][list]; old_ref++){
 932             int poc = ref1->ref_poc[colfield][list][old_ref];
 933
 934             if     (!interl)
 935                 poc |= 3;
 936             else if( interl && (poc&3) == 3) //FIXME store all MBAFF references so this isnt needed
 937                 poc= (poc&~3) + rfield + 1;
 938
 939             for(j=start; j<end; j++){
 940                 if(4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3) == poc){
 941                     int cur_ref= mbafi ? (j-16)^field : j;
 942                     map[list][2*old_ref + (rfield^field) + 16] = cur_ref;
 943                     if(rfield == field)
 944                         map[list][old_ref] = cur_ref;
 945                     break;
 946                 }
 947             }
 948         }
 949     }
 950 }
 951
 952 static inline void direct_ref_list_init(H264Context * const h){
 953     MpegEncContext * const s = &h->s;
 954     Picture * const ref1 = &h->ref_list[1][0];
 955     Picture * const cur = s->current_picture_ptr;
 956     int list, j, field;
 957     int sidx= (s->picture_structure&1)^1;
 958     int ref1sidx= (ref1->reference&1)^1;
 959
 960     for(list=0; list<2; list++){
 961         cur->ref_count[sidx][list] = h->ref_count[list];
 962         for(j=0; j<h->ref_count[list]; j++)
 963             cur->ref_poc[sidx][list][j] = 4*h->ref_list[list][j].frame_num + (h->ref_list[list][j].reference&3);
 964     }
 965
 966     if(s->picture_structure == PICT_FRAME){
 967         memcpy(cur->ref_count[1], cur->ref_count[0], sizeof(cur->ref_count[0]));
 968         memcpy(cur->ref_poc  [1], cur->ref_poc  [0], sizeof(cur->ref_poc  [0]));
 969     }
 970
 971     cur->mbaff= FRAME_MBAFF;
 972
 973     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 974         return;
 975
 976     for(list=0; list<2; list++){
 977         fill_colmap(h, h->map_col_to_list0, list, sidx, ref1sidx, 0);
 978         for(field=0; field<2; field++)
 979             fill_colmap(h, h->map_col_to_list0_field[field], list, field, field, 1);
 980     }
 981 }
 982
 983 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
 984     MpegEncContext * const s = &h->s;
 985     int b8_stride = h->b8_stride;
 986     int b4_stride = h->b_stride;
 987     int mb_xy = h->mb_xy;
 988     int mb_type_col[2];
 989     const int16_t (*l1mv0)[2], (*l1mv1)[2];
 990     const int8_t *l1ref0, *l1ref1;
 991     const int is_b8x8 = IS_8X8(*mb_type);
 992     unsigned int sub_mb_type;
 993     int i8, i4;
 994
 995 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 996
 997     if(IS_INTERLACED(h->ref_list[1][0].mb_type[mb_xy])){ // AFL/AFR/FR/FL -> AFL/FL
 998         if(!IS_INTERLACED(*mb_type)){                    //     AFR/FR    -> AFL/FL
 999             int cur_poc = s->current_picture_ptr->poc;
1000             int *col_poc = h->ref_list[1]->field_poc;
1001             int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1002             mb_xy= s->mb_x + ((s->mb_y&~1) + col_parity)*s->mb_stride;
1003             b8_stride = 0;
1004         }else if(!(s->picture_structure & h->ref_list[1][0].reference) && !h->ref_list[1][0].mbaff){// FL -> FL & differ parity
1005             int fieldoff= 2*(h->ref_list[1][0].reference)-3;
1006             mb_xy += s->mb_stride*fieldoff;
1007         }
1008         goto single_col;
1009     }else{                                               // AFL/AFR/FR/FL -> AFR/FR
1010         if(IS_INTERLACED(*mb_type)){                     // AFL       /FL -> AFR/FR
1011             mb_xy= s->mb_x + (s->mb_y&~1)*s->mb_stride;
1012             mb_type_col[0] = h->ref_list[1][0].mb_type[mb_xy];
1013             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy + s->mb_stride];
1014             b8_stride *= 3;
1015             b4_stride *= 6;
1016             //FIXME IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag
1017             if(    (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)
1018                 && (mb_type_col[1] & MB_TYPE_16x16_OR_INTRA)
1019                 && !is_b8x8){
1020                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1021                 *mb_type   |= MB_TYPE_16x8 |MB_TYPE_L0L1|MB_TYPE_DIRECT2; /* B_16x8 */
1022             }else{
1023                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1024                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1025             }
1026         }else{                                           //     AFR/FR    -> AFR/FR
1027 single_col:
1028             mb_type_col[0] =
1029             mb_type_col[1] = h->ref_list[1][0].mb_type[mb_xy];
1030             if(IS_8X8(mb_type_col[0]) && !h->sps.direct_8x8_inference_flag){
1031                 /* FIXME save sub mb types from previous frames (or derive from MVs)
1032                 * so we know exactly what block size to use */
1033                 sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1034                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1035             }else if(!is_b8x8 && (mb_type_col[0] & MB_TYPE_16x16_OR_INTRA)){
1036                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1037                 *mb_type   |= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1038             }else{
1039                 sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1040                 *mb_type   |= MB_TYPE_8x8|MB_TYPE_L0L1;
1041             }
1042         }
1043     }
1044
1045     l1mv0  = &h->ref_list[1][0].motion_val[0][h->mb2b_xy [mb_xy]];
1046     l1mv1  = &h->ref_list[1][0].motion_val[1][h->mb2b_xy [mb_xy]];
1047     l1ref0 = &h->ref_list[1][0].ref_index [0][h->mb2b8_xy[mb_xy]];
1048     l1ref1 = &h->ref_list[1][0].ref_index [1][h->mb2b8_xy[mb_xy]];
1049     if(!b8_stride){
1050         if(s->mb_y&1){
1051             l1ref0 += h->b8_stride;
1052             l1ref1 += h->b8_stride;
1053             l1mv0  +=  2*b4_stride;
1054             l1mv1  +=  2*b4_stride;
1055         }
1056     }
1057
1058     if(h->direct_spatial_mv_pred){
1059         int ref[2];
1060         int mv[2][2];
1061         int list;
1062
1063         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1064
1065         /* ref = min(neighbors) */
1066         for(list=0; list<2; list++){
1067             int refa = h->ref_cache[list][scan8[0] - 1];
1068             int refb = h->ref_cache[list][scan8[0] - 8];
1069             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1070             if(refc == PART_NOT_AVAILABLE)
1071                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1072             ref[list] = FFMIN3((unsigned)refa, (unsigned)refb, (unsigned)refc);
1073             if(ref[list] < 0)
1074                 ref[list] = -1;
1075         }
1076
1077         if(ref[0] < 0 && ref[1] < 0){
1078             ref[0] = ref[1] = 0;
1079             mv[0][0] = mv[0][1] =
1080             mv[1][0] = mv[1][1] = 0;
1081         }else{
1082             for(list=0; list<2; list++){
1083                 if(ref[list] >= 0)
1084                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1085                 else
1086                     mv[list][0] = mv[list][1] = 0;
1087             }
1088         }
1089
1090         if(ref[1] < 0){
1091             if(!is_b8x8)
1092                 *mb_type &= ~MB_TYPE_L1;
1093             sub_mb_type &= ~MB_TYPE_L1;
1094         }else if(ref[0] < 0){
1095             if(!is_b8x8)
1096                 *mb_type &= ~MB_TYPE_L0;
1097             sub_mb_type &= ~MB_TYPE_L0;
1098         }
1099
1100         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1101             for(i8=0; i8<4; i8++){
1102                 int x8 = i8&1;
1103                 int y8 = i8>>1;
1104                 int xy8 = x8+y8*b8_stride;
1105                 int xy4 = 3*x8+y8*b4_stride;
1106                 int a=0, b=0;
1107
1108                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1109                     continue;
1110                 h->sub_mb_type[i8] = sub_mb_type;
1111
1112                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1113                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1114                 if(!IS_INTRA(mb_type_col[y8])
1115                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1116                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1117                     if(ref[0] > 0)
1118                         a= pack16to32(mv[0][0],mv[0][1]);
1119                     if(ref[1] > 0)
1120                         b= pack16to32(mv[1][0],mv[1][1]);
1121                 }else{
1122                     a= pack16to32(mv[0][0],mv[0][1]);
1123                     b= pack16to32(mv[1][0],mv[1][1]);
1124                 }
1125                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1126                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1127             }
1128         }else if(IS_16X16(*mb_type)){
1129             int a=0, b=0;
1130
1131             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1132             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1133             if(!IS_INTRA(mb_type_col[0])
1134                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1135                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1136                        && (h->x264_build>33 || !h->x264_build)))){
1137                 if(ref[0] > 0)
1138                     a= pack16to32(mv[0][0],mv[0][1]);
1139                 if(ref[1] > 0)
1140                     b= pack16to32(mv[1][0],mv[1][1]);
1141             }else{
1142                 a= pack16to32(mv[0][0],mv[0][1]);
1143                 b= pack16to32(mv[1][0],mv[1][1]);
1144             }
1145             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1146             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1147         }else{
1148             for(i8=0; i8<4; i8++){
1149                 const int x8 = i8&1;
1150                 const int y8 = i8>>1;
1151
1152                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1153                     continue;
1154                 h->sub_mb_type[i8] = sub_mb_type;
1155
1156                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1157                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1158                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1159                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1160
1161                 /* col_zero_flag */
1162                 if(!IS_INTRA(mb_type_col[0]) && (   l1ref0[x8 + y8*b8_stride] == 0
1163                                               || (l1ref0[x8 + y8*b8_stride] < 0 && l1ref1[x8 + y8*b8_stride] == 0
1164                                                   && (h->x264_build>33 || !h->x264_build)))){
1165                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*b8_stride] == 0 ? l1mv0 : l1mv1;
1166                     if(IS_SUB_8X8(sub_mb_type)){
1167                         const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1168                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1169                             if(ref[0] == 0)
1170                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1171                             if(ref[1] == 0)
1172                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1173                         }
1174                     }else
1175                     for(i4=0; i4<4; i4++){
1176                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1177                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1178                             if(ref[0] == 0)
1179                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1180                             if(ref[1] == 0)
1181                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1182                         }
1183                     }
1184                 }
1185             }
1186         }
1187     }else{ /* direct temporal mv pred */
1188         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1189         const int *dist_scale_factor = h->dist_scale_factor;
1190         int ref_offset= 0;
1191
1192         if(FRAME_MBAFF && IS_INTERLACED(*mb_type)){
1193             map_col_to_list0[0] = h->map_col_to_list0_field[s->mb_y&1][0];
1194             map_col_to_list0[1] = h->map_col_to_list0_field[s->mb_y&1][1];
1195             dist_scale_factor   =h->dist_scale_factor_field[s->mb_y&1];
1196         }
1197         if(h->ref_list[1][0].mbaff && IS_INTERLACED(mb_type_col[0]))
1198             ref_offset += 16;
1199
1200         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col[0])){
1201             /* FIXME assumes direct_8x8_inference == 1 */
1202             int y_shift  = 2*!IS_INTERLACED(*mb_type);
1203
1204             for(i8=0; i8<4; i8++){
1205                 const int x8 = i8&1;
1206                 const int y8 = i8>>1;
1207                 int ref0, scale;
1208                 const int16_t (*l1mv)[2]= l1mv0;
1209
1210                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1211                     continue;
1212                 h->sub_mb_type[i8] = sub_mb_type;
1213
1214                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1215                 if(IS_INTRA(mb_type_col[y8])){
1216                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1217                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1218                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1219                     continue;
1220                 }
1221
1222                 ref0 = l1ref0[x8 + y8*b8_stride];
1223                 if(ref0 >= 0)
1224                     ref0 = map_col_to_list0[0][ref0 + ref_offset];
1225                 else{
1226                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1227                     l1mv= l1mv1;
1228                 }
1229                 scale = dist_scale_factor[ref0];
1230                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1231
1232                 {
1233                     const int16_t *mv_col = l1mv[x8*3 + y8*b4_stride];
1234                     int my_col = (mv_col[1]<<y_shift)/2;
1235                     int mx = (scale * mv_col[0] + 128) >> 8;
1236                     int my = (scale * my_col + 128) >> 8;
1237                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1238                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1239                 }
1240             }
1241             return;
1242         }
1243
1244         /* one-to-one mv scaling */
1245
1246         if(IS_16X16(*mb_type)){
1247             int ref, mv0, mv1;
1248
1249             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1250             if(IS_INTRA(mb_type_col[0])){
1251                 ref=mv0=mv1=0;
1252             }else{
1253                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0] + ref_offset]
1254                                                 : map_col_to_list0[1][l1ref1[0] + ref_offset];
1255                 const int scale = dist_scale_factor[ref0];
1256                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1257                 int mv_l0[2];
1258                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1259                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1260                 ref= ref0;
1261                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1262                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1263             }
1264             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1265             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1266             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1267         }else{
1268             for(i8=0; i8<4; i8++){
1269                 const int x8 = i8&1;
1270                 const int y8 = i8>>1;
1271                 int ref0, scale;
1272                 const int16_t (*l1mv)[2]= l1mv0;
1273
1274                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1275                     continue;
1276                 h->sub_mb_type[i8] = sub_mb_type;
1277                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1278                 if(IS_INTRA(mb_type_col[0])){
1279                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1280                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1281                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1282                     continue;
1283                 }
1284
1285                 ref0 = l1ref0[x8 + y8*b8_stride] + ref_offset;
1286                 if(ref0 >= 0)
1287                     ref0 = map_col_to_list0[0][ref0];
1288                 else{
1289                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*b8_stride] + ref_offset];
1290                     l1mv= l1mv1;
1291                 }
1292                 scale = dist_scale_factor[ref0];
1293
1294                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1295                 if(IS_SUB_8X8(sub_mb_type)){
1296                     const int16_t *mv_col = l1mv[x8*3 + y8*3*b4_stride];
1297                     int mx = (scale * mv_col[0] + 128) >> 8;
1298                     int my = (scale * mv_col[1] + 128) >> 8;
1299                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1300                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1301                 }else
1302                 for(i4=0; i4<4; i4++){
1303                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*b4_stride];
1304                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1305                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1306                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1307                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1308                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1309                 }
1310             }
1311         }
1312     }
1313 }
1314
1315 static inline void write_back_motion(H264Context *h, int mb_type){
1316     MpegEncContext * const s = &h->s;
1317     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1318     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1319     int list;
1320
1321     if(!USES_LIST(mb_type, 0))
1322         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1323
1324     for(list=0; list<h->list_count; list++){
1325         int y;
1326         if(!USES_LIST(mb_type, list))
1327             continue;
1328
1329         for(y=0; y<4; y++){
1330             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1331             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1332         }
1333         if( h->pps.cabac ) {
1334             if(IS_SKIP(mb_type))
1335                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1336             else
1337             for(y=0; y<4; y++){
1338                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1339                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1340             }
1341         }
1342
1343         {
1344             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1345             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1346             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1347             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1348             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1349         }
1350     }
1351
1352     if(h->slice_type_nos == FF_B_TYPE && h->pps.cabac){
1353         if(IS_8X8(mb_type)){
1354             uint8_t *direct_table = &h->direct_table[b8_xy];
1355             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1356             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1357             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1358         }
1359     }
1360 }
1361
1362 /**
1363  * Decodes a network abstraction layer unit.
1364  * @param consumed is the number of bytes used as input
1365  * @param length is the length of the array
1366  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1367  * @returns decoded bytes, might be src+1 if no escapes
1368  */
1369 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1370     int i, si, di;
1371     uint8_t *dst;
1372     int bufidx;
1373
1374 //    src[0]&0x80;                //forbidden bit
1375     h->nal_ref_idc= src[0]>>5;
1376     h->nal_unit_type= src[0]&0x1F;
1377
1378     src++; length--;
1379 #if 0
1380     for(i=0; i<length; i++)
1381         printf("%2X ", src[i]);
1382 #endif
1383
1384 #if HAVE_FAST_UNALIGNED
1385 # if HAVE_FAST_64BIT
1386 #   define RS 7
1387     for(i=0; i+1<length; i+=9){
1388         if(!((~*(uint64_t*)(src+i) & (*(uint64_t*)(src+i) - 0x0100010001000101ULL)) & 0x8000800080008080ULL))
1389 # else
1390 #   define RS 3
1391     for(i=0; i+1<length; i+=5){
1392         if(!((~*(uint32_t*)(src+i) & (*(uint32_t*)(src+i) - 0x01000101U)) & 0x80008080U))
1393 # endif
1394             continue;
1395         if(i>0 && !src[i]) i--;
1396         while(src[i]) i++;
1397 #else
1398 #   define RS 0
1399     for(i=0; i+1<length; i+=2){
1400         if(src[i]) continue;
1401         if(i>0 && src[i-1]==0) i--;
1402 #endif
1403         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1404             if(src[i+2]!=3){
1405                 /* startcode, so we must be past the end */
1406                 length=i;
1407             }
1408             break;
1409         }
1410         i-= RS;
1411     }
1412
1413     if(i>=length-1){ //no escaped 0
1414         *dst_length= length;
1415         *consumed= length+1; //+1 for the header
1416         return src;
1417     }
1418
1419     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1420     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length+FF_INPUT_BUFFER_PADDING_SIZE);
1421     dst= h->rbsp_buffer[bufidx];
1422
1423     if (dst == NULL){
1424         return NULL;
1425     }
1426
1427 //printf("decoding esc\n");
1428     memcpy(dst, src, i);
1429     si=di=i;
1430     while(si+2<length){
1431         //remove escapes (very rare 1:2^22)
1432         if(src[si+2]>3){
1433             dst[di++]= src[si++];
1434             dst[di++]= src[si++];
1435         }else if(src[si]==0 && src[si+1]==0){
1436             if(src[si+2]==3){ //escape
1437                 dst[di++]= 0;
1438                 dst[di++]= 0;
1439                 si+=3;
1440                 continue;
1441             }else //next start code
1442                 goto nsc;
1443         }
1444
1445         dst[di++]= src[si++];
1446     }
1447     while(si<length)
1448         dst[di++]= src[si++];
1449 nsc:
1450
1451     memset(dst+di, 0, FF_INPUT_BUFFER_PADDING_SIZE);
1452
1453     *dst_length= di;
1454     *consumed= si + 1;//+1 for the header
1455 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1456     return dst;
1457 }
1458
1459 /**
1460  * identifies the exact end of the bitstream
1461  * @return the length of the trailing, or 0 if damaged
1462  */
1463 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1464     int v= *src;
1465     int r;
1466
1467     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1468
1469     for(r=1; r<9; r++){
1470         if(v&1) return r;
1471         v>>=1;
1472     }
1473     return 0;
1474 }
1475
1476 /**
1477  * IDCT transforms the 16 dc values and dequantizes them.
1478  * @param qp quantization parameter
1479  */
1480 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1481 #define stride 16
1482     int i;
1483     int temp[16]; //FIXME check if this is a good idea
1484     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1485     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1486
1487 //memset(block, 64, 2*256);
1488 //return;
1489     for(i=0; i<4; i++){
1490         const int offset= y_offset[i];
1491         const int z0= block[offset+stride*0] + block[offset+stride*4];
1492         const int z1= block[offset+stride*0] - block[offset+stride*4];
1493         const int z2= block[offset+stride*1] - block[offset+stride*5];
1494         const int z3= block[offset+stride*1] + block[offset+stride*5];
1495
1496         temp[4*i+0]= z0+z3;
1497         temp[4*i+1]= z1+z2;
1498         temp[4*i+2]= z1-z2;
1499         temp[4*i+3]= z0-z3;
1500     }
1501
1502     for(i=0; i<4; i++){
1503         const int offset= x_offset[i];
1504         const int z0= temp[4*0+i] + temp[4*2+i];
1505         const int z1= temp[4*0+i] - temp[4*2+i];
1506         const int z2= temp[4*1+i] - temp[4*3+i];
1507         const int z3= temp[4*1+i] + temp[4*3+i];
1508
1509         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_residual
1510         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1511         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1512         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1513     }
1514 }
1515
#if 0
/**
 * DCT transforms the 16 dc values.
 * Disabled code; it relies on the `stride` macro still being defined here.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
    int temp[16]; //FIXME check if this is a good idea
    int i;

    /* first pass: butterflies over the four values of each y_offset group */
    for(i=0; i<4; i++){
        const DCTELEM *in= block + y_offset[i];
        int *out= temp + 4*i;
        const int v0= in[stride*0];
        const int v1= in[stride*1];
        const int v4= in[stride*4];
        const int v5= in[stride*5];
        const int z0= v0 + v4;
        const int z1= v0 - v4;
        const int z2= v1 - v5;
        const int z3= v1 + v5;

        out[0]= z0+z3;
        out[1]= z1+z2;
        out[2]= z1-z2;
        out[3]= z0-z3;
    }

    /* second pass: butterflies across the intermediate values, then halve */
    for(i=0; i<4; i++){
        DCTELEM *out= block + x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        out[stride*0 ]= (z0 + z3)>>1;
        out[stride*2 ]= (z1 + z2)>>1;
        out[stride*8 ]= (z1 - z2)>>1;
        out[stride*10]= (z0 - z3)>>1;
    }
}
#endif
1555
1556 #undef xStride
1557 #undef stride
1558
1559 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1560     const int stride= 16*2;
1561     const int xStride= 16;
1562     int a,b,c,d,e;
1563
1564     a= block[stride*0 + xStride*0];
1565     b= block[stride*0 + xStride*1];
1566     c= block[stride*1 + xStride*0];
1567     d= block[stride*1 + xStride*1];
1568
1569     e= a-b;
1570     a= a+b;
1571     b= c-d;
1572     c= c+d;
1573
1574     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1575     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1576     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1577     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1578 }
1579
#if 0
/**
 * Transforms the 2x2 chroma dc values in place (disabled code).
 * The four values sit at block[0], block[16], block[32] and block[48].
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int row_step= 16*2;
    const int col_step= 16;
    DCTELEM * const top    = block;
    DCTELEM * const bottom = block + row_step;
    const int top_sum = top[0]    + top[col_step];
    const int top_dif = top[0]    - top[col_step];
    const int bot_sum = bottom[0] + bottom[col_step];
    const int bot_dif = bottom[0] - bottom[col_step];

    top[0]          = (top_sum + bot_sum);
    top[col_step]   = (top_dif + bot_dif);
    bottom[0]       = (top_sum - bot_sum);
    bottom[col_step]= (top_dif - bot_dif);
}
#endif
1602
1603 /**
1604  * gets the chroma qp.
1605  */
1606 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1607     return h->pps.chroma_qp_table[t][qscale];
1608 }
1609
1610 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1611                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1612                            int src_x_offset, int src_y_offset,
1613                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1614     MpegEncContext * const s = &h->s;
1615     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1616     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1617     const int luma_xy= (mx&3) + ((my&3)<<2);
1618     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1619     uint8_t * src_cb, * src_cr;
1620     int extra_width= h->emu_edge_width;
1621     int extra_height= h->emu_edge_height;
1622     int emu=0;
1623     const int full_mx= mx>>2;
1624     const int full_my= my>>2;
1625     const int pic_width  = 16*s->mb_width;
1626     const int pic_height = 16*s->mb_height >> MB_FIELD;
1627
1628     if(mx&7) extra_width -= 3;
1629     if(my&7) extra_height -= 3;
1630
1631     if(   full_mx < 0-extra_width
1632        || full_my < 0-extra_height
1633        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1634        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1635         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1636             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1637         emu=1;
1638     }
1639
1640     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1641     if(!square){
1642         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1643     }
1644
1645     if(CONFIG_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1646
1647     if(MB_FIELD){
1648         // chroma offset when predicting from a field of opposite parity
1649         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1650         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1651     }
1652     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1653     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1654
1655     if(emu){
1656         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1657             src_cb= s->edge_emu_buffer;
1658     }
1659     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1660
1661     if(emu){
1662         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1663             src_cr= s->edge_emu_buffer;
1664     }
1665     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1666 }
1667
1668 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1669                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1670                            int x_offset, int y_offset,
1671                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1672                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1673                            int list0, int list1){
1674     MpegEncContext * const s = &h->s;
1675     qpel_mc_func *qpix_op=  qpix_put;
1676     h264_chroma_mc_func chroma_op= chroma_put;
1677
1678     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1679     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1680     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1681     x_offset += 8*s->mb_x;
1682     y_offset += 8*(s->mb_y >> MB_FIELD);
1683
1684     if(list0){
1685         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1686         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1687                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1688                            qpix_op, chroma_op);
1689
1690         qpix_op=  qpix_avg;
1691         chroma_op= chroma_avg;
1692     }
1693
1694     if(list1){
1695         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1696         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1697                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1698                            qpix_op, chroma_op);
1699     }
1700 }
1701
1702 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1703                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1704                            int x_offset, int y_offset,
1705                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1706                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1707                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1708                            int list0, int list1){
1709     MpegEncContext * const s = &h->s;
1710
1711     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1712     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1713     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1714     x_offset += 8*s->mb_x;
1715     y_offset += 8*(s->mb_y >> MB_FIELD);
1716
1717     if(list0 && list1){
1718         /* don't optimize for luma-only case, since B-frames usually
1719          * use implicit weights => chroma too. */
1720         uint8_t *tmp_cb = s->obmc_scratchpad;
1721         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1722         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1723         int refn0 = h->ref_cache[0][ scan8[n] ];
1724         int refn1 = h->ref_cache[1][ scan8[n] ];
1725
1726         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1727                     dest_y, dest_cb, dest_cr,
1728                     x_offset, y_offset, qpix_put, chroma_put);
1729         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1730                     tmp_y, tmp_cb, tmp_cr,
1731                     x_offset, y_offset, qpix_put, chroma_put);
1732
1733         if(h->use_weight == 2){
1734             int weight0 = h->implicit_weight[refn0][refn1];
1735             int weight1 = 64 - weight0;
1736             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1737             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1738             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1739         }else{
1740             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1741                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1742                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1743             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1744                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1745                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1746             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1747                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1748                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1749         }
1750     }else{
1751         int list = list1 ? 1 : 0;
1752         int refn = h->ref_cache[list][ scan8[n] ];
1753         Picture *ref= &h->ref_list[list][refn];
1754         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1755                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1756                     qpix_put, chroma_put);
1757
1758         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1759                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1760         if(h->use_weight_chroma){
1761             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1762                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1763             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1764                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1765         }
1766     }
1767 }
1768
1769 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1770                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1771                            int x_offset, int y_offset,
1772                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1773                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1774                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1775                            int list0, int list1){
1776     if((h->use_weight==2 && list0 && list1
1777         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1778        || h->use_weight==1)
1779         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1780                          x_offset, y_offset, qpix_put, chroma_put,
1781                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1782     else
1783         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1784                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1785 }
1786
1787 static inline void prefetch_motion(H264Context *h, int list){
1788     /* fetch pixels for estimated mv 4 macroblocks ahead
1789      * optimized for 64byte cache lines */
1790     MpegEncContext * const s = &h->s;
1791     const int refn = h->ref_cache[list][scan8[0]];
1792     if(refn >= 0){
1793         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1794         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1795         uint8_t **src= h->ref_list[list][refn].data;
1796         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1797         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1798         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1799         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1800     }
1801 }
1802
1803 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1804                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
1805                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
1806                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
1807     MpegEncContext * const s = &h->s;
1808     const int mb_xy= h->mb_xy;
1809     const int mb_type= s->current_picture.mb_type[mb_xy];
1810
1811     assert(IS_INTER(mb_type));
1812
1813     prefetch_motion(h, 0);
1814
1815     if(IS_16X16(mb_type)){
1816         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
1817                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
1818                 &weight_op[0], &weight_avg[0],
1819                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1820     }else if(IS_16X8(mb_type)){
1821         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
1822                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1823                 &weight_op[1], &weight_avg[1],
1824                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1825         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
1826                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
1827                 &weight_op[1], &weight_avg[1],
1828                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1829     }else if(IS_8X16(mb_type)){
1830         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
1831                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1832                 &weight_op[2], &weight_avg[2],
1833                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
1834         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
1835                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1836                 &weight_op[2], &weight_avg[2],
1837                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
1838     }else{
1839         int i;
1840
1841         assert(IS_8X8(mb_type));
1842
1843         for(i=0; i<4; i++){
1844             const int sub_mb_type= h->sub_mb_type[i];
1845             const int n= 4*i;
1846             int x_offset= (i&1)<<2;
1847             int y_offset= (i&2)<<1;
1848
1849             if(IS_SUB_8X8(sub_mb_type)){
1850                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1851                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
1852                     &weight_op[3], &weight_avg[3],
1853                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1854             }else if(IS_SUB_8X4(sub_mb_type)){
1855                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1856                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1857                     &weight_op[4], &weight_avg[4],
1858                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1859                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
1860                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
1861                     &weight_op[4], &weight_avg[4],
1862                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1863             }else if(IS_SUB_4X8(sub_mb_type)){
1864                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
1865                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1866                     &weight_op[5], &weight_avg[5],
1867                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1868                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
1869                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1870                     &weight_op[5], &weight_avg[5],
1871                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1872             }else{
1873                 int j;
1874                 assert(IS_SUB_4X4(sub_mb_type));
1875                 for(j=0; j<4; j++){
1876                     int sub_x_offset= x_offset + 2*(j&1);
1877                     int sub_y_offset= y_offset +   (j&2);
1878                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
1879                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
1880                         &weight_op[6], &weight_avg[6],
1881                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
1882                 }
1883             }
1884         }
1885     }
1886
1887     prefetch_motion(h, 1);
1888 }
1889
1890 static av_cold void init_cavlc_level_tab(void){
1891     int suffix_length, mask;
1892     unsigned int i;
1893
1894     for(suffix_length=0; suffix_length<7; suffix_length++){
1895         for(i=0; i<(1<<LEVEL_TAB_BITS); i++){
1896             int prefix= LEVEL_TAB_BITS - av_log2(2*i);
1897             int level_code= (prefix<<suffix_length) + (i>>(LEVEL_TAB_BITS-prefix-1-suffix_length)) - (1<<suffix_length);
1898
1899             mask= -(level_code&1);
1900             level_code= (((2+level_code)>>1) ^ mask) - mask;
1901             if(prefix + 1 + suffix_length <= LEVEL_TAB_BITS){
1902                 cavlc_level_tab[suffix_length][i][0]= level_code;
1903                 cavlc_level_tab[suffix_length][i][1]= prefix + 1 + suffix_length;
1904             }else if(prefix + 1 <= LEVEL_TAB_BITS){
1905                 cavlc_level_tab[suffix_length][i][0]= prefix+100;
1906                 cavlc_level_tab[suffix_length][i][1]= prefix + 1;
1907             }else{
1908                 cavlc_level_tab[suffix_length][i][0]= LEVEL_TAB_BITS+100;
1909                 cavlc_level_tab[suffix_length][i][1]= LEVEL_TAB_BITS;
1910             }
1911         }
1912     }
1913 }
1914
1915 static av_cold void decode_init_vlc(void){
1916     static int done = 0;
1917
1918     if (!done) {
1919         int i;
1920         int offset;
1921         done = 1;
1922
1923         chroma_dc_coeff_token_vlc.table = chroma_dc_coeff_token_vlc_table;
1924         chroma_dc_coeff_token_vlc.table_allocated = chroma_dc_coeff_token_vlc_table_size;
1925         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1926                  &chroma_dc_coeff_token_len [0], 1, 1,
1927                  &chroma_dc_coeff_token_bits[0], 1, 1,
1928                  INIT_VLC_USE_NEW_STATIC);
1929
1930         offset = 0;
1931         for(i=0; i<4; i++){
1932             coeff_token_vlc[i].table = coeff_token_vlc_tables+offset;
1933             coeff_token_vlc[i].table_allocated = coeff_token_vlc_tables_size[i];
1934             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1935                      &coeff_token_len [i][0], 1, 1,
1936                      &coeff_token_bits[i][0], 1, 1,
1937                      INIT_VLC_USE_NEW_STATIC);
1938             offset += coeff_token_vlc_tables_size[i];
1939         }
1940         /*
1941          * This is a one time safety check to make sure that
1942          * the packed static coeff_token_vlc table sizes
1943          * were initialized correctly.
1944          */
1945         assert(offset == FF_ARRAY_ELEMS(coeff_token_vlc_tables));
1946
1947         for(i=0; i<3; i++){
1948             chroma_dc_total_zeros_vlc[i].table = chroma_dc_total_zeros_vlc_tables[i];
1949             chroma_dc_total_zeros_vlc[i].table_allocated = chroma_dc_total_zeros_vlc_tables_size;
1950             init_vlc(&chroma_dc_total_zeros_vlc[i],
1951                      CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1952                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1953                      &chroma_dc_total_zeros_bits[i][0], 1, 1,
1954                      INIT_VLC_USE_NEW_STATIC);
1955         }
1956         for(i=0; i<15; i++){
1957             total_zeros_vlc[i].table = total_zeros_vlc_tables[i];
1958             total_zeros_vlc[i].table_allocated = total_zeros_vlc_tables_size;
1959             init_vlc(&total_zeros_vlc[i],
1960                      TOTAL_ZEROS_VLC_BITS, 16,
1961                      &total_zeros_len [i][0], 1, 1,
1962                      &total_zeros_bits[i][0], 1, 1,
1963                      INIT_VLC_USE_NEW_STATIC);
1964         }
1965
1966         for(i=0; i<6; i++){
1967             run_vlc[i].table = run_vlc_tables[i];
1968             run_vlc[i].table_allocated = run_vlc_tables_size;
1969             init_vlc(&run_vlc[i],
1970                      RUN_VLC_BITS, 7,
1971                      &run_len [i][0], 1, 1,
1972                      &run_bits[i][0], 1, 1,
1973                      INIT_VLC_USE_NEW_STATIC);
1974         }
1975         run7_vlc.table = run7_vlc_table,
1976         run7_vlc.table_allocated = run7_vlc_table_size;
1977         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1978                  &run_len [6][0], 1, 1,
1979                  &run_bits[6][0], 1, 1,
1980                  INIT_VLC_USE_NEW_STATIC);
1981
1982         init_cavlc_level_tab();
1983     }
1984 }
1985
1986 static void free_tables(H264Context *h){
1987     int i;
1988     H264Context *hx;
1989     av_freep(&h->intra4x4_pred_mode);
1990     av_freep(&h->chroma_pred_mode_table);
1991     av_freep(&h->cbp_table);
1992     av_freep(&h->mvd_table[0]);
1993     av_freep(&h->mvd_table[1]);
1994     av_freep(&h->direct_table);
1995     av_freep(&h->non_zero_count);
1996     av_freep(&h->slice_table_base);
1997     h->slice_table= NULL;
1998
1999     av_freep(&h->mb2b_xy);
2000     av_freep(&h->mb2b8_xy);
2001
2002     for(i = 0; i < h->s.avctx->thread_count; i++) {
2003         hx = h->thread_context[i];
2004         if(!hx) continue;
2005         av_freep(&hx->top_borders[1]);
2006         av_freep(&hx->top_borders[0]);
2007         av_freep(&hx->s.obmc_scratchpad);
2008     }
2009 }
2010
2011 static void init_dequant8_coeff_table(H264Context *h){
2012     int i,q,x;
2013     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2014     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2015     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2016
2017     for(i=0; i<2; i++ ){
2018         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2019             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2020             break;
2021         }
2022
2023         for(q=0; q<52; q++){
2024             int shift = div6[q];
2025             int idx = rem6[q];
2026             for(x=0; x<64; x++)
2027                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2028                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2029                     h->pps.scaling_matrix8[i][x]) << shift;
2030         }
2031     }
2032 }
2033
2034 static void init_dequant4_coeff_table(H264Context *h){
2035     int i,j,q,x;
2036     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2037     for(i=0; i<6; i++ ){
2038         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2039         for(j=0; j<i; j++){
2040             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2041                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2042                 break;
2043             }
2044         }
2045         if(j<i)
2046             continue;
2047
2048         for(q=0; q<52; q++){
2049             int shift = div6[q] + 2;
2050             int idx = rem6[q];
2051             for(x=0; x<16; x++)
2052                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2053                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2054                     h->pps.scaling_matrix4[i][x]) << shift;
2055         }
2056     }
2057 }
2058
2059 static void init_dequant_tables(H264Context *h){
2060     int i,x;
2061     init_dequant4_coeff_table(h);
2062     if(h->pps.transform_8x8_mode)
2063         init_dequant8_coeff_table(h);
2064     if(h->sps.transform_bypass){
2065         for(i=0; i<6; i++)
2066             for(x=0; x<16; x++)
2067                 h->dequant4_coeff[i][0][x] = 1<<6;
2068         if(h->pps.transform_8x8_mode)
2069             for(i=0; i<2; i++)
2070                 for(x=0; x<64; x++)
2071                     h->dequant8_coeff[i][0][x] = 1<<6;
2072     }
2073 }
2074
2075
2076 /**
2077  * allocates tables.
2078  * needs width/height
2079  */
2080 static int alloc_tables(H264Context *h){
2081     MpegEncContext * const s = &h->s;
2082     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2083     int x,y;
2084
2085     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2086
2087     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2088     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(*h->slice_table_base))
2089     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2090
2091     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2092     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2093     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2094     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2095
2096     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(*h->slice_table_base));
2097     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2098
2099     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2100     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2101     for(y=0; y<s->mb_height; y++){
2102         for(x=0; x<s->mb_width; x++){
2103             const int mb_xy= x + y*s->mb_stride;
2104             const int b_xy = 4*x + 4*y*h->b_stride;
2105             const int b8_xy= 2*x + 2*y*h->b8_stride;
2106
2107             h->mb2b_xy [mb_xy]= b_xy;
2108             h->mb2b8_xy[mb_xy]= b8_xy;
2109         }
2110     }
2111
2112     s->obmc_scratchpad = NULL;
2113
2114     if(!h->dequant4_coeff[0])
2115         init_dequant_tables(h);
2116
2117     return 0;
2118 fail:
2119     free_tables(h);
2120     return -1;
2121 }
2122
2123 /**
2124  * Mimic alloc_tables(), but for every context thread.
2125  */
2126 static void clone_tables(H264Context *dst, H264Context *src){
2127     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2128     dst->non_zero_count           = src->non_zero_count;
2129     dst->slice_table              = src->slice_table;
2130     dst->cbp_table                = src->cbp_table;
2131     dst->mb2b_xy                  = src->mb2b_xy;
2132     dst->mb2b8_xy                 = src->mb2b8_xy;
2133     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2134     dst->mvd_table[0]             = src->mvd_table[0];
2135     dst->mvd_table[1]             = src->mvd_table[1];
2136     dst->direct_table             = src->direct_table;
2137
2138     dst->s.obmc_scratchpad = NULL;
2139     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2140 }
2141
2142 /**
2143  * Init context
2144  * Allocate buffers which are not shared amongst multiple threads.
2145  */
2146 static int context_init(H264Context *h){
2147     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2148     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2149
2150     return 0;
2151 fail:
2152     return -1; // free_tables will clean up for us
2153 }
2154
2155 static av_cold void common_init(H264Context *h){
2156     MpegEncContext * const s = &h->s;
2157
2158     s->width = s->avctx->width;
2159     s->height = s->avctx->height;
2160     s->codec_id= s->avctx->codec->id;
2161
2162     ff_h264_pred_init(&h->hpc, s->codec_id);
2163
2164     h->dequant_coeff_pps= -1;
2165     s->unrestricted_mv=1;
2166     s->decode=1; //FIXME
2167
2168     dsputil_init(&s->dsp, s->avctx); // needed so that idct permutation is known early
2169
2170     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2171     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2172 }
2173
2174 static av_cold int decode_init(AVCodecContext *avctx){
2175     H264Context *h= avctx->priv_data;
2176     MpegEncContext * const s = &h->s;
2177
2178     MPV_decode_defaults(s);
2179
2180     s->avctx = avctx;
2181     common_init(h);
2182
2183     s->out_format = FMT_H264;
2184     s->workaround_bugs= avctx->workaround_bugs;
2185
2186     // set defaults
2187 //    s->decode_mb= ff_h263_decode_mb;
2188     s->quarter_sample = 1;
2189     s->low_delay= 1;
2190
2191     if(avctx->codec_id == CODEC_ID_SVQ3)
2192         avctx->pix_fmt= PIX_FMT_YUVJ420P;
2193     else if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
2194         avctx->pix_fmt= PIX_FMT_VDPAU_H264;
2195     else
2196         avctx->pix_fmt= PIX_FMT_YUV420P;
2197
2198     decode_init_vlc();
2199
2200     if(avctx->extradata_size > 0 && avctx->extradata &&
2201        *(char *)avctx->extradata == 1){
2202         h->is_avc = 1;
2203         h->got_avcC = 0;
2204     } else {
2205         h->is_avc = 0;
2206     }
2207
2208     h->thread_context[0] = h;
2209     h->outputed_poc = INT_MIN;
2210     h->prev_poc_msb= 1<<16;
2211     return 0;
2212 }
2213
2214 static int frame_start(H264Context *h){
2215     MpegEncContext * const s = &h->s;
2216     int i;
2217
2218     if(MPV_frame_start(s, s->avctx) < 0)
2219         return -1;
2220     ff_er_frame_start(s);
2221     /*
2222      * MPV_frame_start uses pict_type to derive key_frame.
2223      * This is incorrect for H.264; IDR markings must be used.
2224      * Zero here; IDR markings per slice in frame or fields are ORed in later.
2225      * See decode_nal_units().
2226      */
2227     s->current_picture_ptr->key_frame= 0;
2228
2229     assert(s->linesize && s->uvlinesize);
2230
2231     for(i=0; i<16; i++){
2232         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2233         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2234     }
2235     for(i=0; i<4; i++){
2236         h->block_offset[16+i]=
2237         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2238         h->block_offset[24+16+i]=
2239         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2240     }
2241
2242     /* can't be in alloc_tables because linesize isn't known there.
2243      * FIXME: redo bipred weight to not require extra buffer? */
2244     for(i = 0; i < s->avctx->thread_count; i++)
2245         if(!h->thread_context[i]->s.obmc_scratchpad)
2246             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2247
2248     /* some macroblocks will be accessed before they're available */
2249     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2250         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(*h->slice_table));
2251
2252 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2253
2254     // We mark the current picture as non-reference after allocating it, so
2255     // that if we break out due to an error it can be released automatically
2256     // in the next MPV_frame_start().
2257     // SVQ3 as well as most other codecs have only last/next/current and thus
2258     // get released even with set reference, besides SVQ3 and others do not
2259     // mark frames as reference later "naturally".
2260     if(s->codec_id != CODEC_ID_SVQ3)
2261         s->current_picture_ptr->reference= 0;
2262
2263     s->current_picture_ptr->field_poc[0]=
2264     s->current_picture_ptr->field_poc[1]= INT_MAX;
2265     assert(s->current_picture_ptr->long_ref==0);
2266
2267     return 0;
2268 }
2269
/**
 * Saves the border samples of the current macroblock into h->left_border and
 * h->top_borders so they stay available after the picture is modified.
 * hl_decode_mb_internal() calls this right before running the loop filter.
 *
 * Layout used here:
 *  - h->top_borders[idx][mb_x]: bytes 0..15 luma row, 16..23 Cb row, 24..31 Cr row
 *  - h->left_border: luma entries start at 'offset', Cb at 'uvoffset+34',
 *    Cr at 'uvoffset+34+18'; consecutive rows are 'step' entries apart
 *
 * @param src_y,src_cb,src_cr pointers to the top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize distance between two rows of the macroblock in luma / chroma
 * @param simple              non-zero skips the MBAFF handling and the CODEC_FLAG_GRAY check
 */
2270 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2271     MpegEncContext * const s = &h->s;
2272     int i;
2273     int step    = 1;
2274     int offset  = 1;
2275     int uvoffset= 1;
2276     int top_idx = 1;
2277     int skiplast= 0;
2278
         /* move to the row above the macroblock: from here on row index i
          * (i*linesize) addresses macroblock row i-1 */
2279     src_y  -=   linesize;
2280     src_cb -= uvlinesize;
2281     src_cr -= uvlinesize;
2282
2283     if(!simple && FRAME_MBAFF){
2284         if(s->mb_y&1){
                 /* odd mb_y (second macroblock of a pair) */
2285             offset  = MB_MBAFF ? 1 : 17;
2286             uvoffset= MB_MBAFF ? 1 : 9;
2287             if(!MB_MBAFF){
                     /* also keep the second-to-last row (luma row 14, chroma row 6)
                      * in top_borders[0] */
2288                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 0)= *(uint64_t*)(src_y +  15*linesize);
2289                 *(uint64_t*)(h->top_borders[0][s->mb_x]+ 8)= *(uint64_t*)(src_y +8+15*linesize);
2290                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2291                     *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+7*uvlinesize);
2292                     *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+7*uvlinesize);
2293                 }
2294             }
2295         }else{
                 /* even mb_y (first macroblock of a pair) */
2296             if(!MB_MBAFF){
                     /* entry 0 of each left_border section comes from the last sample
                      * of the previously stored top_borders[0] row */
2297                 h->left_border[0]= h->top_borders[0][s->mb_x][15];
2298                 if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2299                     h->left_border[34   ]= h->top_borders[0][s->mb_x][16+7  ];
2300                     h->left_border[34+18]= h->top_borders[0][s->mb_x][16+8+7];
2301                 }
                     /* the loops below then stop one row early */
2302                 skiplast= 1;
2303             }
2304             offset  =
2305             uvoffset=
2306             top_idx = MB_MBAFF ? 0 : 1;
2307         }
             /* MB_MBAFF macroblocks interleave their rows in left_border */
2308         step= MB_MBAFF ? 2 : 1;
2309     }
2310
2311     // There are two lines saved, the line above the top macroblock of a pair,
2312     // and the line above the bottom macroblock
         /* first luma entry: last sample of the old top border row (read before it
          * is overwritten below), followed by the rightmost luma column */
2313     h->left_border[offset]= h->top_borders[top_idx][s->mb_x][15];
2314     for(i=1; i<17 - skiplast; i++){
2315         h->left_border[offset+i*step]= src_y[15+i*  linesize];
2316     }
2317
         /* bottom luma row (macroblock row 15), copied as two 8-byte words */
2318     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2319     *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2320
         /* same for both chroma planes unless gray-only decoding is active */
2321     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2322         h->left_border[uvoffset+34   ]= h->top_borders[top_idx][s->mb_x][16+7];
2323         h->left_border[uvoffset+34+18]= h->top_borders[top_idx][s->mb_x][24+7];
2324         for(i=1; i<9 - skiplast; i++){
2325             h->left_border[uvoffset+34   +i*step]= src_cb[7+i*uvlinesize];
2326             h->left_border[uvoffset+34+18+i*step]= src_cr[7+i*uvlinesize];
2327         }
2328         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2329         *(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2330     }
2331 }
2332
/**
 * Exchanges the samples stored by backup_mb_border() (h->left_border,
 * h->top_borders) with the picture samples directly left of and above the
 * current macroblock.
 * hl_decode_mb_internal() calls it with xchg=1 before intra prediction and
 * with xchg=0 afterwards.
 *
 * @param src_y,src_cb,src_cr pointers to the top-left sample of the macroblock in each plane
 * @param linesize,uvlinesize distance between two rows of the macroblock in luma / chroma
 * @param xchg                non-zero: swap in both directions; zero: for the entries that
 *                            use this flag only copy the stored value into the picture
 * @param simple              non-zero skips the MBAFF handling and the CODEC_FLAG_GRAY check
 */
2333 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2334     MpegEncContext * const s = &h->s;
2335     int temp8, i;
2336     uint64_t temp64;
2337     int deblock_left;
2338     int deblock_top;
2339     int mb_xy;
2340     int step    = 1;
2341     int offset  = 1;
2342     int uvoffset= 1;
2343     int top_idx = 1;
2344
         /* same left_border / top_borders addressing as in backup_mb_border() */
2345     if(!simple && FRAME_MBAFF){
2346         if(s->mb_y&1){
2347             offset  = MB_MBAFF ? 1 : 17;
2348             uvoffset= MB_MBAFF ? 1 : 9;
2349         }else{
2350             offset  =
2351             uvoffset=
2352             top_idx = MB_MBAFF ? 0 : 1;
2353         }
2354         step= MB_MBAFF ? 2 : 1;
2355     }
2356
         /* decide which neighbours take part: with deblocking_filter == 2 only
          * neighbours with the same slice_table entry, otherwise every neighbour
          * that lies inside the picture */
2357     if(h->deblocking_filter == 2) {
2358         mb_xy = h->mb_xy;
2359         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2360         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2361     } else {
2362         deblock_left = (s->mb_x > 0);
2363         deblock_top =  (s->mb_y > !!MB_FIELD);
2364     }
2365
         /* point at the sample diagonally above-left of the macroblock */
2366     src_y  -=   linesize + 1;
2367     src_cb -= uvlinesize + 1;
2368     src_cr -= uvlinesize + 1;
2369
     /* XCHG: b always receives the old value of a; a receives the old value of b
      * only when xchg is non-zero. t is scratch storage. */
2370 #define XCHG(a,b,t,xchg)\
2371 t= a;\
2372 if(xchg)\
2373     a= b;\
2374 b= t;
2375
2376     if(deblock_left){
             /* left luma column; index 0 is the top-left corner sample and is
              * skipped when the top neighbour does not take part */
2377         for(i = !deblock_top; i<16; i++){
2378             XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, xchg);
2379         }
             /* i == 16 here: the last row is always swapped both ways */
2380         XCHG(h->left_border[offset+i*step], src_y [i*  linesize], temp8, 1);
2381     }
2382
2383     if(deblock_top){
             /* luma row above: first 8 samples honour xchg, the next 8 and the
              * first 8 of the next macroblock column (above-right) are always swapped */
2384         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2385         XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2386         if(s->mb_x+1 < s->mb_width){
2387             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2388         }
2389     }
2390
         /* chroma planes, unless gray-only decoding is active */
2391     if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2392         if(deblock_left){
2393             for(i = !deblock_top; i<8; i++){
2394                 XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, xchg);
2395                 XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, xchg);
2396             }
                 /* i == 8 here: last chroma row, always swapped both ways */
2397             XCHG(h->left_border[uvoffset+34   +i*step], src_cb[i*uvlinesize], temp8, 1);
2398             XCHG(h->left_border[uvoffset+34+18+i*step], src_cr[i*uvlinesize], temp8, 1);
2399         }
2400         if(deblock_top){
2401             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2402             XCHG(*(uint64_t*)(h->top_borders[top_idx][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2403         }
2404     }
2405 }
2406
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture from the already parsed data in the context:
 *  1. intra PCM samples are copied directly, otherwise
 *  2. the prediction is built (intra prediction, or hl_motion() for inter
 *     macroblocks of H.264),
 *  3. the luma and chroma residuals stored in h->mb are added,
 *  4. h->mb is cleared and, if h->deblocking_filter is set, the borders are
 *     backed up and the loop filter is run.
 *
 * @param simple when non-zero, everything guarded by "!simple" (field / MBAFF
 *               macroblocks, intra PCM, transform bypass) is skipped, the
 *               CODEC_FLAG_GRAY tests are bypassed and is_h264 is forced to 1.
 *               The function is instantiated once per value by
 *               hl_decode_mb_simple() and hl_decode_mb_complex().
 */
2407 static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
2408     MpegEncContext * const s = &h->s;
2409     const int mb_x= s->mb_x;
2410     const int mb_y= s->mb_y;
2411     const int mb_xy= h->mb_xy;
2412     const int mb_type= s->current_picture.mb_type[mb_xy];
2413     uint8_t  *dest_y, *dest_cb, *dest_cr;
2414     int linesize, uvlinesize /*dct_offset*/;
2415     int i;
2416     int *block_offset = &h->block_offset[0];
2417     const int transform_bypass = !simple && (s->qscale == 0 && h->sps.transform_bypass);
2418     /* is_h264 should always be true if SVQ3 is disabled. */
2419     const int is_h264 = !CONFIG_SVQ3_DECODER || simple || s->codec_id == CODEC_ID_H264;
2420     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
2421     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
2422
         /* top-left sample of this macroblock in each plane
          * (16x16 luma, 8x8 for each chroma plane) */
2423     dest_y  = s->current_picture.data[0] + (mb_x + mb_y * s->linesize  ) * 16;
2424     dest_cb = s->current_picture.data[1] + (mb_x + mb_y * s->uvlinesize) * 8;
2425     dest_cr = s->current_picture.data[2] + (mb_x + mb_y * s->uvlinesize) * 8;
2426
2427     s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
2428     s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);
2429
2430     if (!simple && MB_FIELD) {
             /* field macroblock: consecutive rows are two picture lines apart, so
              * use doubled line sizes and the second half of block_offset[], which
              * frame_start() computes with doubled row distance */
2431         linesize   = h->mb_linesize   = s->linesize * 2;
2432         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
2433         block_offset = &h->block_offset[24];
2434         if(mb_y&1){ //FIXME move out of this function?
                 /* odd mb_y: move back so that the macroblock starts one picture
                  * line below the first line of the macroblock above */
2435             dest_y -= s->linesize*15;
2436             dest_cb-= s->uvlinesize*7;
2437             dest_cr-= s->uvlinesize*7;
2438         }
2439         if(FRAME_MBAFF) {
2440             int list;
                 /* rewrite the non-negative cached reference indices of every used list
                  * as (16+ref)^(mb_y&1)
                  * NOTE(review): presumably selects the field reference of matching
                  * parity for hl_motion() -- confirm against the ref list layout */
2441             for(list=0; list<h->list_count; list++){
2442                 if(!USES_LIST(mb_type, list))
2443                     continue;
2444                 if(IS_16X16(mb_type)){
2445                     int8_t *ref = &h->ref_cache[list][scan8[0]];
2446                     fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
2447                 }else{
2448                     for(i=0; i<16; i+=4){
2449                         int ref = h->ref_cache[list][scan8[i]];
2450                         if(ref >= 0)
2451                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
2452                     }
2453                 }
2454             }
2455         }
2456     } else {
2457         linesize   = h->mb_linesize   = s->linesize;
2458         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
2459 //        dct_offset = s->linesize * 16;
2460     }
2461
2462     if (!simple && IS_INTRA_PCM(mb_type)) {
             /* intra PCM: the samples are held in h->mb and are copied to the
              * picture row by row (16 luma rows, then 8 rows per chroma plane) */
2463         for (i=0; i<16; i++) {
2464             memcpy(dest_y + i*  linesize, h->mb       + i*8, 16);
2465         }
2466         for (i=0; i<8; i++) {
2467             memcpy(dest_cb+ i*uvlinesize, h->mb + 128 + i*4,  8);
2468             memcpy(dest_cr+ i*uvlinesize, h->mb + 160 + i*4,  8);
2469         }
2470     } else {
2471         if(IS_INTRA(mb_type)){
                 /* intra prediction reads the neighbouring samples from the picture;
                  * with the loop filter enabled, swap in the border samples that
                  * backup_mb_border() stored before filtering */
2472             if(h->deblocking_filter)
2473                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);
2474
                 /* chroma prediction */
2475             if(simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2476                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2477                 h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2478             }
2479
2480             if(IS_INTRA4x4(mb_type)){
                     /* block-wise luma prediction: every block is predicted and gets its
                      * residual added before the next one, since later blocks predict
                      * from the reconstructed samples */
2481                 if(simple || !s->encoding){
2482                     if(IS_8x8DCT(mb_type)){
                             /* four 8x8 blocks */
2483                         if(transform_bypass){
2484                             idct_dc_add =
2485                             idct_add    = s->dsp.add_pixels8;
2486                         }else{
2487                             idct_dc_add = s->dsp.h264_idct8_dc_add;
2488                             idct_add    = s->dsp.h264_idct8_add;
2489                         }
2490                         for(i=0; i<16; i+=4){
2491                             uint8_t * const ptr= dest_y + block_offset[i];
2492                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2493                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
                                     /* bypass with profile_idc 244 and prediction mode 0 or 1:
                                      * combined predict-and-add routine */
2494                                 h->hpc.pred8x8l_add[dir](ptr, h->mb + i*16, linesize);
2495                             }else{
2496                                 const int nnz = h->non_zero_count_cache[ scan8[i] ];
2497                                 h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
2498                                                             (h->topright_samples_available<<i)&0x4000, linesize);
2499                                 if(nnz){
                                         /* a single coefficient that is the first one of the
                                          * block: use the DC-only add */
2500                                     if(nnz == 1 && h->mb[i*16])
2501                                         idct_dc_add(ptr, h->mb + i*16, linesize);
2502                                     else
2503                                         idct_add   (ptr, h->mb + i*16, linesize);
2504                                 }
2505                             }
2506                         }
2507                     }else{
                             /* sixteen 4x4 blocks */
2508                         if(transform_bypass){
2509                             idct_dc_add =
2510                             idct_add    = s->dsp.add_pixels4;
2511                         }else{
2512                             idct_dc_add = s->dsp.h264_idct_dc_add;
2513                             idct_add    = s->dsp.h264_idct_add;
2514                         }
2515                         for(i=0; i<16; i++){
2516                             uint8_t * const ptr= dest_y + block_offset[i];
2517                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2518
2519                             if(transform_bypass && h->sps.profile_idc==244 && dir<=1){
2520                                 h->hpc.pred4x4_add[dir](ptr, h->mb + i*16, linesize);
2521                             }else{
2522                                 uint8_t *topright;
2523                                 int nnz, tr;
                                     /* only these two modes get a top-right pointer */
2524                                 if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2525                                     const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2526                                     assert(mb_y || linesize <= block_offset[i]);
2527                                     if(!topright_avail){
                                             /* not available: replicate the last sample of the
                                              * row above four times into a local word */
2528                                         tr= ptr[3 - linesize]*0x01010101;
2529                                         topright= (uint8_t*) &tr;
2530                                     }else
2531                                         topright= ptr + 4 - linesize;
2532                                 }else
2533                                     topright= NULL;
2534
2535                                 h->hpc.pred4x4[ dir ](ptr, topright, linesize);
2536                                 nnz = h->non_zero_count_cache[ scan8[i] ];
2537                                 if(nnz){
2538                                     if(is_h264){
2539                                         if(nnz == 1 && h->mb[i*16])
2540                                             idct_dc_add(ptr, h->mb + i*16, linesize);
2541                                         else
2542                                             idct_add   (ptr, h->mb + i*16, linesize);
2543                                     }else
2544                                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
2545                                 }
2546                             }
2547                         }
2548                     }
2549                 }
2550             }else{
                     /* remaining intra types: one 16x16 prediction, then the luma DC
                      * coefficients are dequantized and inverse transformed in h->mb;
                      * the residual itself is added further below */
2551                 h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
2552                 if(is_h264){
2553                     if(!transform_bypass)
2554                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
2555                 }else
2556                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
2557             }
                 /* second border exchange, this time with xchg=0 */
2558             if(h->deblocking_filter)
2559                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
2560         }else if(is_h264){
                 /* inter macroblock: motion compensated prediction for all planes */
2561             hl_motion(h, dest_y, dest_cb, dest_cr,
2562                       s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
2563                       s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
2564                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
2565         }
2566
2567
             /* luma residual for everything except intra 4x4, which was handled
              * block by block above */
2568         if(!IS_INTRA4x4(mb_type)){
2569             if(is_h264){
2570                 if(IS_INTRA16x16(mb_type)){
2571                     if(transform_bypass){
2572                         if(h->sps.profile_idc==244 && (h->intra16x16_pred_mode==VERT_PRED8x8 || h->intra16x16_pred_mode==HOR_PRED8x8)){
2573                             h->hpc.pred16x16_add[h->intra16x16_pred_mode](dest_y, block_offset, h->mb, linesize);
2574                         }else{
2575                             for(i=0; i<16; i++){
2576                                 if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2577                                     s->dsp.add_pixels4(dest_y + block_offset[i], h->mb + i*16, linesize);
2578                             }
2579                         }
2580                     }else{
2581                          s->dsp.h264_idct_add16intra(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2582                     }
2583                 }else if(h->cbp&15){
                         /* inter macroblock with at least one of the four luma cbp bits set */
2584                     if(transform_bypass){
2585                         const int di = IS_8x8DCT(mb_type) ? 4 : 1;
2586                         idct_add= IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
2587                         for(i=0; i<16; i+=di){
2588                             if(h->non_zero_count_cache[ scan8[i] ]){
2589                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
2590                             }
2591                         }
2592                     }else{
2593                         if(IS_8x8DCT(mb_type)){
2594                             s->dsp.h264_idct8_add4(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2595                         }else{
2596                             s->dsp.h264_idct_add16(dest_y, block_offset, h->mb, linesize, h->non_zero_count_cache);
2597                         }
2598                     }
2599                 }
2600             }else{
                     /* SVQ3 */
2601                 for(i=0; i<16; i++){
2602                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
2603                         uint8_t * const ptr= dest_y + block_offset[i];
2604                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
2605                     }
2606                 }
2607             }
2608         }
2609
             /* chroma residual: only when one of the chroma cbp bits (0x30) is set;
              * blocks 16..19 go to Cb, 20..23 to Cr (dest[(i&4)>>2]) */
2610         if((simple || !CONFIG_GRAY || !(s->flags&CODEC_FLAG_GRAY)) && (h->cbp&0x30)){
2611             uint8_t *dest[2] = {dest_cb, dest_cr};
2612             if(transform_bypass){
2613                 if(IS_INTRA(mb_type) && h->sps.profile_idc==244 && (h->chroma_pred_mode==VERT_PRED8x8 || h->chroma_pred_mode==HOR_PRED8x8)){
2614                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[0], block_offset + 16, h->mb + 16*16, uvlinesize);
2615                     h->hpc.pred8x8_add[h->chroma_pred_mode](dest[1], block_offset + 20, h->mb + 20*16, uvlinesize);
2616                 }else{
2617                     idct_add = s->dsp.add_pixels4;
2618                     for(i=16; i<16+8; i++){
2619                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16])
2620                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2621                     }
2622                 }
2623             }else{
                     /* chroma DC of each plane first, with the plane's own chroma QP and
                      * a dequant table chosen by plane and intra/inter */
2624                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
2625                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
2626                 if(is_h264){
2627                     idct_add = s->dsp.h264_idct_add;
2628                     idct_dc_add = s->dsp.h264_idct_dc_add;
2629                     for(i=16; i<16+8; i++){
                             /* full IDCT when the block has a non-zero count, DC-only
                              * add when only its first coefficient is set */
2630                         if(h->non_zero_count_cache[ scan8[i] ])
2631                             idct_add   (dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2632                         else if(h->mb[i*16])
2633                             idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
2634                     }
2635                 }else{
2636                     for(i=16; i<16+8; i++){
2637                         if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
2638                             uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
2639                             svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
2640                         }
2641                     }
2642                 }
2643             }
2644         }
2645     }
         /* clear the coefficient buffer when it may hold data, i.e. for coded
          * or intra macroblocks */
2646     if(h->cbp || IS_INTRA(mb_type))
2647         s->dsp.clear_blocks(h->mb);
2648
2649     if(h->deblocking_filter) {
             /* store the borders before the filter changes them, refill the caches
              * for the filter and recompute both chroma QPs from the stored
              * macroblock QP */
2650         backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
2651         fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
2652         h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
2653         h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
2654         if (!simple && FRAME_MBAFF) {
2655             filter_mb     (h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2656         } else {
2657             filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
2658         }
2659     }
2660 }
2661
2662 /**
2663  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2664  */
2665 static void hl_decode_mb_simple(H264Context *h){
2666     hl_decode_mb_internal(h, 1);
2667 }
2668
2669 /**
2670  * Process a macroblock; this handles edge cases, such as interlacing.
2671  */
2672 static void av_noinline hl_decode_mb_complex(H264Context *h){
2673     hl_decode_mb_internal(h, 0);
2674 }
2675
2676 static void hl_decode_mb(H264Context *h){
2677     MpegEncContext * const s = &h->s;
2678     const int mb_xy= h->mb_xy;
2679     const int mb_type= s->current_picture.mb_type[mb_xy];
2680     int is_complex = CONFIG_SMALL || h->is_complex || IS_INTRA_PCM(mb_type) || s->qscale == 0;
2681
2682     if (is_complex)
2683         hl_decode_mb_complex(h);
2684     else hl_decode_mb_simple(h);
2685 }
2686
2687 static void pic_as_field(Picture *pic, const int parity){
2688     int i;
2689     for (i = 0; i < 4; ++i) {
2690         if (parity == PICT_BOTTOM_FIELD)
2691             pic->data[i] += pic->linesize[i];
2692         pic->reference = parity;
2693         pic->linesize[i] *= 2;
2694     }
2695     pic->poc= pic->field_poc[parity == PICT_BOTTOM_FIELD];
2696 }
2697
2698 static int split_field_copy(Picture *dest, Picture *src,
2699                             int parity, int id_add){
2700     int match = !!(src->reference & parity);
2701
2702     if (match) {
2703         *dest = *src;
2704         if(parity != PICT_FRAME){
2705             pic_as_field(dest, parity);
2706             dest->pic_id *= 2;
2707             dest->pic_id += id_add;
2708         }
2709     }
2710
2711     return match;
2712 }
2713
2714 static int build_def_list(Picture *def, Picture **in, int len, int is_long, int sel){
2715     int i[2]={0};
2716     int index=0;
2717
2718     while(i[0]<len || i[1]<len){
2719         while(i[0]<len && !(in[ i[0] ] && (in[ i[0] ]->reference & sel)))
2720             i[0]++;
2721         while(i[1]<len && !(in[ i[1] ] && (in[ i[1] ]->reference & (sel^3))))
2722             i[1]++;
2723         if(i[0] < len){
2724             in[ i[0] ]->pic_id= is_long ? i[0] : in[ i[0] ]->frame_num;
2725             split_field_copy(&def[index++], in[ i[0]++ ], sel  , 1);
2726         }
2727         if(i[1] < len){
2728             in[ i[1] ]->pic_id= is_long ? i[1] : in[ i[1] ]->frame_num;
2729             split_field_copy(&def[index++], in[ i[1]++ ], sel^3, 0);
2730         }
2731     }
2732
2733     return index;
2734 }
2735
2736 static int add_sorted(Picture **sorted, Picture **src, int len, int limit, int dir){
2737     int i, best_poc;
2738     int out_i= 0;
2739
2740     for(;;){
2741         best_poc= dir ? INT_MIN : INT_MAX;
2742
2743         for(i=0; i<len; i++){
2744             const int poc= src[i]->poc;
2745             if(((poc > limit) ^ dir) && ((poc < best_poc) ^ dir)){
2746                 best_poc= poc;
2747                 sorted[out_i]= src[i];
2748             }
2749         }
2750         if(best_poc == (dir ? INT_MIN : INT_MAX))
2751             break;
2752         limit= sorted[out_i++]->poc - dir;
2753     }
2754     return out_i;
2755 }
2756
2757 /**
2758  * fills the default_ref_list.
2759  */
2760 static int fill_default_ref_list(H264Context *h){
2761     MpegEncContext * const s = &h->s;
2762     int i, len;
2763
2764     if(h->slice_type_nos==FF_B_TYPE){
2765         Picture *sorted[32];
2766         int cur_poc, list;
2767         int lens[2];
2768
2769         if(FIELD_PICTURE)
2770             cur_poc= s->current_picture_ptr->field_poc[ s->picture_structure == PICT_BOTTOM_FIELD ];
2771         else
2772             cur_poc= s->current_picture_ptr->poc;
2773
2774         for(list= 0; list<2; list++){
2775             len= add_sorted(sorted    , h->short_ref, h->short_ref_count, cur_poc, 1^list);
2776             len+=add_sorted(sorted+len, h->short_ref, h->short_ref_count, cur_poc, 0^list);
2777             assert(len<=32);
2778             len= build_def_list(h->default_ref_list[list]    , sorted     , len, 0, s->picture_structure);
2779             len+=build_def_list(h->default_ref_list[list]+len, h->long_ref, 16 , 1, s->picture_structure);
2780             assert(len<=32);
2781
2782             if(len < h->ref_count[list])
2783                 memset(&h->default_ref_list[list][len], 0, sizeof(Picture)*(h->ref_count[list] - len));
2784             lens[list]= len;
2785         }
2786
2787         if(lens[0] == lens[1] && lens[1] > 1){
2788             for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0] && i<lens[0]; i++);
2789             if(i == lens[0])
2790                 FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2791         }
2792     }else{
2793         len = build_def_list(h->default_ref_list[0]    , h->short_ref, h->short_ref_count, 0, s->picture_structure);
2794         len+= build_def_list(h->default_ref_list[0]+len, h-> long_ref, 16                , 1, s->picture_structure);
2795         assert(len <= 32);
2796         if(len < h->ref_count[0])
2797             memset(&h->default_ref_list[0][len], 0, sizeof(Picture)*(h->ref_count[0] - len));
2798     }
2799 #ifdef TRACE
2800     for (i=0; i<h->ref_count[0]; i++) {
2801         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2802     }
2803     if(h->slice_type_nos==FF_B_TYPE){
2804         for (i=0; i<h->ref_count[1]; i++) {
2805             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2806         }
2807     }
2808 #endif
2809     return 0;
2810 }
2811
2812 static void print_short_term(H264Context *h);
2813 static void print_long_term(H264Context *h);
2814
2815 /**
2816  * Extract structure information about the picture described by pic_num in
2817  * the current decoding context (frame or field). Note that pic_num is
2818  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2819  * @param pic_num picture number for which to extract structure information
2820  * @param structure one of PICT_XXX describing structure of picture
2821  *                      with pic_num
2822  * @return frame number (short term) or long term index of picture
2823  *         described by pic_num
2824  */
2825 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
2826     MpegEncContext * const s = &h->s;
2827
2828     *structure = s->picture_structure;
2829     if(FIELD_PICTURE){
2830         if (!(pic_num & 1))
2831             /* opposite field */
2832             *structure ^= PICT_FRAME;
2833         pic_num >>= 1;
2834     }
2835
2836     return pic_num;
2837 }
2838
2839 static int decode_ref_pic_list_reordering(H264Context *h){
2840     MpegEncContext * const s = &h->s;
2841     int list, index, pic_structure;
2842
2843     print_short_term(h);
2844     print_long_term(h);
2845
2846     for(list=0; list<h->list_count; list++){
2847         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
2848
2849         if(get_bits1(&s->gb)){
2850             int pred= h->curr_pic_num;
2851
2852             for(index=0; ; index++){
2853                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb_31(&s->gb);
2854                 unsigned int pic_id;
2855                 int i;
2856                 Picture *ref = NULL;
2857
2858                 if(reordering_of_pic_nums_idc==3)
2859                     break;
2860
2861                 if(index >= h->ref_count[list]){
2862                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
2863                     return -1;
2864                 }
2865
2866                 if(reordering_of_pic_nums_idc<3){
2867                     if(reordering_of_pic_nums_idc<2){
2868                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
2869                         int frame_num;
2870
2871                         if(abs_diff_pic_num > h->max_pic_num){
2872                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
2873                             return -1;
2874                         }
2875
2876                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
2877                         else                                pred+= abs_diff_pic_num;
2878                         pred &= h->max_pic_num - 1;
2879
2880                         frame_num = pic_num_extract(h, pred, &pic_structure);
2881
2882                         for(i= h->short_ref_count-1; i>=0; i--){
2883                             ref = h->short_ref[i];
2884                             assert(ref->reference);
2885                             assert(!ref->long_ref);
2886                             if(
2887                                    ref->frame_num == frame_num &&
2888                                    (ref->reference & pic_structure)
2889                               )
2890                                 break;
2891                         }
2892                         if(i>=0)
2893                             ref->pic_id= pred;
2894                     }else{
2895                         int long_idx;
2896                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
2897
2898                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
2899
2900                         if(long_idx>31){
2901                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
2902                             return -1;
2903                         }
2904                         ref = h->long_ref[long_idx];
2905                         assert(!(ref && !ref->reference));
2906                         if(ref && (ref->reference & pic_structure)){
2907                             ref->pic_id= pic_id;
2908                             assert(ref->long_ref);
2909                             i=0;
2910                         }else{
2911                             i=-1;
2912                         }
2913                     }
2914
2915                     if (i < 0) {
2916                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
2917                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
2918                     } else {
2919                         for(i=index; i+1<h->ref_count[list]; i++){
2920                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
2921                                 break;
2922                         }
2923                         for(; i > index; i--){
2924                             h->ref_list[list][i]= h->ref_list[list][i-1];
2925                         }
2926                         h->ref_list[list][index]= *ref;
2927                         if (FIELD_PICTURE){
2928                             pic_as_field(&h->ref_list[list][index], pic_structure);
2929                         }
2930                     }
2931                 }else{
2932                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
2933                     return -1;
2934                 }
2935             }
2936         }
2937     }
2938     for(list=0; list<h->list_count; list++){
2939         for(index= 0; index < h->ref_count[list]; index++){
2940             if(!h->ref_list[list][index].data[0]){
2941                 av_log(h->s.avctx, AV_LOG_ERROR, "Missing reference picture\n");
2942                 h->ref_list[list][index]= s->current_picture; //FIXME this is not a sensible solution
2943             }
2944         }
2945     }
2946
2947     return 0;
2948 }
2949
2950 static void fill_mbaff_ref_list(H264Context *h){
2951     int list, i, j;
2952     for(list=0; list<2; list++){ //FIXME try list_count
2953         for(i=0; i<h->ref_count[list]; i++){
2954             Picture *frame = &h->ref_list[list][i];
2955             Picture *field = &h->ref_list[list][16+2*i];
2956             field[0] = *frame;
2957             for(j=0; j<3; j++)
2958                 field[0].linesize[j] <<= 1;
2959             field[0].reference = PICT_TOP_FIELD;
2960             field[0].poc= field[0].field_poc[0];
2961             field[1] = field[0];
2962             for(j=0; j<3; j++)
2963                 field[1].data[j] += frame->linesize[j];
2964             field[1].reference = PICT_BOTTOM_FIELD;
2965             field[1].poc= field[1].field_poc[1];
2966
2967             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
2968             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
2969             for(j=0; j<2; j++){
2970                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
2971                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
2972             }
2973         }
2974     }
2975     for(j=0; j<h->ref_count[1]; j++){
2976         for(i=0; i<h->ref_count[0]; i++)
2977             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
2978         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
2979         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
2980     }
2981 }
2982
2983 static int pred_weight_table(H264Context *h){
2984     MpegEncContext * const s = &h->s;
2985     int list, i;
2986     int luma_def, chroma_def;
2987
2988     h->use_weight= 0;
2989     h->use_weight_chroma= 0;
2990     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
2991     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
2992     luma_def = 1<<h->luma_log2_weight_denom;
2993     chroma_def = 1<<h->chroma_log2_weight_denom;
2994
2995     for(list=0; list<2; list++){
2996         h->luma_weight_flag[list]   = 0;
2997         h->chroma_weight_flag[list] = 0;
2998         for(i=0; i<h->ref_count[list]; i++){
2999             int luma_weight_flag, chroma_weight_flag;
3000
3001             luma_weight_flag= get_bits1(&s->gb);
3002             if(luma_weight_flag){
3003                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3004                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3005                 if(   h->luma_weight[list][i] != luma_def
3006                    || h->luma_offset[list][i] != 0) {
3007                     h->use_weight= 1;
3008                     h->luma_weight_flag[list]= 1;
3009                 }
3010             }else{
3011                 h->luma_weight[list][i]= luma_def;
3012                 h->luma_offset[list][i]= 0;
3013             }
3014
3015             if(CHROMA){
3016                 chroma_weight_flag= get_bits1(&s->gb);
3017                 if(chroma_weight_flag){
3018                     int j;
3019                     for(j=0; j<2; j++){
3020                         h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3021                         h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3022                         if(   h->chroma_weight[list][i][j] != chroma_def
3023                            || h->chroma_offset[list][i][j] != 0) {
3024                             h->use_weight_chroma= 1;
3025                             h->chroma_weight_flag[list]= 1;
3026                         }
3027                     }
3028                 }else{
3029                     int j;
3030                     for(j=0; j<2; j++){
3031                         h->chroma_weight[list][i][j]= chroma_def;
3032                         h->chroma_offset[list][i][j]= 0;
3033                     }
3034                 }
3035             }
3036         }
3037         if(h->slice_type_nos != FF_B_TYPE) break;
3038     }
3039     h->use_weight= h->use_weight || h->use_weight_chroma;
3040     return 0;
3041 }
3042
3043 static void implicit_weight_table(H264Context *h){
3044     MpegEncContext * const s = &h->s;
3045     int ref0, ref1, i;
3046     int cur_poc = s->current_picture_ptr->poc;
3047
3048     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3049        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3050         h->use_weight= 0;
3051         h->use_weight_chroma= 0;
3052         return;
3053     }
3054
3055     h->use_weight= 2;
3056     h->use_weight_chroma= 2;
3057     h->luma_log2_weight_denom= 5;
3058     h->chroma_log2_weight_denom= 5;
3059     for (i = 0; i < 2; i++) {
3060         h->luma_weight_flag[i]   = 0;
3061         h->chroma_weight_flag[i] = 0;
3062     }
3063
3064     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3065         int poc0 = h->ref_list[0][ref0].poc;
3066         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3067             int poc1 = h->ref_list[1][ref1].poc;
3068             int td = av_clip(poc1 - poc0, -128, 127);
3069             if(td){
3070                 int tb = av_clip(cur_poc - poc0, -128, 127);
3071                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3072                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3073                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3074                     h->implicit_weight[ref0][ref1] = 32;
3075                 else
3076                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3077             }else
3078                 h->implicit_weight[ref0][ref1] = 32;
3079         }
3080     }
3081 }
3082
3083 /**
3084  * Mark a picture as no longer needed for reference. The refmask
3085  * argument allows unreferencing of individual fields or the whole frame.
3086  * If the picture becomes entirely unreferenced, but is being held for
3087  * display purposes, it is marked as such.
3088  * @param refmask mask of fields to unreference; the mask is bitwise
3089  *                anded with the reference marking of pic
3090  * @return non-zero if pic becomes entirely unreferenced (except possibly
3091  *         for display purposes) zero if one of the fields remains in
3092  *         reference
3093  */
3094 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3095     int i;
3096     if (pic->reference &= refmask) {
3097         return 0;
3098     } else {
3099         for(i = 0; h->delayed_pic[i]; i++)
3100             if(pic == h->delayed_pic[i]){
3101                 pic->reference=DELAYED_PIC_REF;
3102                 break;
3103             }
3104         return 1;
3105     }
3106 }
3107
3108 /**
3109  * instantaneous decoder refresh.
3110  */
3111 static void idr(H264Context *h){
3112     int i;
3113
3114     for(i=0; i<16; i++){
3115         remove_long(h, i, 0);
3116     }
3117     assert(h->long_ref_count==0);
3118
3119     for(i=0; i<h->short_ref_count; i++){
3120         unreference_pic(h, h->short_ref[i], 0);
3121         h->short_ref[i]= NULL;
3122     }
3123     h->short_ref_count=0;
3124     h->prev_frame_num= 0;
3125     h->prev_frame_num_offset= 0;
3126     h->prev_poc_msb=
3127     h->prev_poc_lsb= 0;
3128 }
3129
3130 /* forget old pics after a seek */
3131 static void flush_dpb(AVCodecContext *avctx){
3132     H264Context *h= avctx->priv_data;
3133     int i;
3134     for(i=0; i<MAX_DELAYED_PIC_COUNT; i++) {
3135         if(h->delayed_pic[i])
3136             h->delayed_pic[i]->reference= 0;
3137         h->delayed_pic[i]= NULL;
3138     }
3139     h->outputed_poc= INT_MIN;
3140     idr(h);
3141     if(h->s.current_picture_ptr)
3142         h->s.current_picture_ptr->reference= 0;
3143     h->s.first_field= 0;
3144     ff_mpeg_flush(avctx);
3145 }
3146
3147 /**
3148  * Find a Picture in the short term reference list by frame number.
3149  * @param frame_num frame number to search for
3150  * @param idx the index into h->short_ref where returned picture is found
3151  *            undefined if no picture found.
3152  * @return pointer to the found picture, or NULL if no pic with the provided
3153  *                 frame number is found
3154  */
3155 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3156     MpegEncContext * const s = &h->s;
3157     int i;
3158
3159     for(i=0; i<h->short_ref_count; i++){
3160         Picture *pic= h->short_ref[i];
3161         if(s->avctx->debug&FF_DEBUG_MMCO)
3162             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3163         if(pic->frame_num == frame_num) {
3164             *idx = i;
3165             return pic;
3166         }
3167     }
3168     return NULL;
3169 }
3170
3171 /**
3172  * Remove a picture from the short term reference list by its index in
3173  * that list.  This does no checking on the provided index; it is assumed
3174  * to be valid. Other list entries are shifted down.
3175  * @param i index into h->short_ref of picture to remove.
3176  */
3177 static void remove_short_at_index(H264Context *h, int i){
3178     assert(i >= 0 && i < h->short_ref_count);
3179     h->short_ref[i]= NULL;
3180     if (--h->short_ref_count)
3181         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3182 }
3183
3184 /**
3185  *
3186  * @return the removed picture or NULL if an error occurs
3187  */
3188 static Picture * remove_short(H264Context *h, int frame_num, int ref_mask){
3189     MpegEncContext * const s = &h->s;
3190     Picture *pic;
3191     int i;
3192
3193     if(s->avctx->debug&FF_DEBUG_MMCO)
3194         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3195
3196     pic = find_short(h, frame_num, &i);
3197     if (pic){
3198         if(unreference_pic(h, pic, ref_mask))
3199         remove_short_at_index(h, i);
3200     }
3201
3202     return pic;
3203 }
3204
3205 /**
3206  * Remove a picture from the long term reference list by its index in
3207  * that list.
3208  * @return the removed picture or NULL if an error occurs
3209  */
3210 static Picture * remove_long(H264Context *h, int i, int ref_mask){
3211     Picture *pic;
3212
3213     pic= h->long_ref[i];
3214     if (pic){
3215         if(unreference_pic(h, pic, ref_mask)){
3216             assert(h->long_ref[i]->long_ref == 1);
3217             h->long_ref[i]->long_ref= 0;
3218             h->long_ref[i]= NULL;
3219             h->long_ref_count--;
3220         }
3221     }
3222
3223     return pic;
3224 }
3225
3226 /**
3227  * print short term list
3228  */
3229 static void print_short_term(H264Context *h) {
3230     uint32_t i;
3231     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3232         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3233         for(i=0; i<h->short_ref_count; i++){
3234             Picture *pic= h->short_ref[i];
3235             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3236         }
3237     }
3238 }
3239
3240 /**
3241  * print long term list
3242  */
3243 static void print_long_term(H264Context *h) {
3244     uint32_t i;
3245     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3246         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3247         for(i = 0; i < 16; i++){
3248             Picture *pic= h->long_ref[i];
3249             if (pic) {
3250                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3251             }
3252         }
3253     }
3254 }
3255
3256 /**
3257  * Executes the reference picture marking (memory management control operations).
3258  */
3259 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3260     MpegEncContext * const s = &h->s;
3261     int i, j;
3262     int current_ref_assigned=0;
3263     Picture *pic;
3264
3265     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3266         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3267
3268     for(i=0; i<mmco_count; i++){
3269         int structure, frame_num;
3270         if(s->avctx->debug&FF_DEBUG_MMCO)
3271             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3272
3273         if(   mmco[i].opcode == MMCO_SHORT2UNUSED
3274            || mmco[i].opcode == MMCO_SHORT2LONG){
3275             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3276             pic = find_short(h, frame_num, &j);
3277             if(!pic){
3278                 if(mmco[i].opcode != MMCO_SHORT2LONG || !h->long_ref[mmco[i].long_arg]
3279                    || h->long_ref[mmco[i].long_arg]->frame_num != frame_num)
3280                 av_log(h->s.avctx, AV_LOG_ERROR, "mmco: unref short failure\n");
3281                 continue;
3282             }
3283         }
3284
3285         switch(mmco[i].opcode){
3286         case MMCO_SHORT2UNUSED:
3287             if(s->avctx->debug&FF_DEBUG_MMCO)
3288                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3289             remove_short(h, frame_num, structure ^ PICT_FRAME);
3290             break;
3291         case MMCO_SHORT2LONG:
3292                 if (h->long_ref[mmco[i].long_arg] != pic)
3293                     remove_long(h, mmco[i].long_arg, 0);
3294
3295                 remove_short_at_index(h, j);
3296                 h->long_ref[ mmco[i].long_arg ]= pic;
3297                 if (h->long_ref[ mmco[i].long_arg ]){
3298                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3299                     h->long_ref_count++;
3300                 }
3301             break;
3302         case MMCO_LONG2UNUSED:
3303             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3304             pic = h->long_ref[j];
3305             if (pic) {
3306                 remove_long(h, j, structure ^ PICT_FRAME);
3307             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3308                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3309             break;
3310         case MMCO_LONG:
3311                     // Comment below left from previous code as it is an interresting note.
3312                     /* First field in pair is in short term list or
3313                      * at a different long term index.
3314                      * This is not allowed; see 7.4.3.3, notes 2 and 3.
3315                      * Report the problem and keep the pair where it is,
3316                      * and mark this field valid.
3317                      */
3318
3319             if (h->long_ref[mmco[i].long_arg] != s->current_picture_ptr) {
3320                 remove_long(h, mmco[i].long_arg, 0);
3321
3322                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3323                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3324                 h->long_ref_count++;
3325             }
3326
3327             s->current_picture_ptr->reference |= s->picture_structure;
3328             current_ref_assigned=1;
3329             break;
3330         case MMCO_SET_MAX_LONG:
3331             assert(mmco[i].long_arg <= 16);
3332             // just remove the long term which index is greater than new max
3333             for(j = mmco[i].long_arg; j<16; j++){
3334                 remove_long(h, j, 0);
3335             }
3336             break;
3337         case MMCO_RESET:
3338             while(h->short_ref_count){
3339                 remove_short(h, h->short_ref[0]->frame_num, 0);
3340             }
3341             for(j = 0; j < 16; j++) {
3342                 remove_long(h, j, 0);
3343             }
3344             s->current_picture_ptr->poc=
3345             s->current_picture_ptr->field_poc[0]=
3346             s->current_picture_ptr->field_poc[1]=
3347             h->poc_lsb=
3348             h->poc_msb=
3349             h->frame_num=
3350             s->current_picture_ptr->frame_num= 0;
3351             break;
3352         default: assert(0);
3353         }
3354     }
3355
3356     if (!current_ref_assigned) {
3357         /* Second field of complementary field pair; the first field of
3358          * which is already referenced. If short referenced, it
3359          * should be first entry in short_ref. If not, it must exist
3360          * in long_ref; trying to put it on the short list here is an
3361          * error in the encoded bit stream (ref: 7.4.3.3, NOTE 2 and 3).
3362          */
3363         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3364             /* Just mark the second field valid */
3365             s->current_picture_ptr->reference = PICT_FRAME;
3366         } else if (s->current_picture_ptr->long_ref) {
3367             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3368                                              "assignment for second field "
3369                                              "in complementary field pair "
3370                                              "(first field is long term)\n");
3371         } else {
3372             pic= remove_short(h, s->current_picture_ptr->frame_num, 0);
3373             if(pic){
3374                 av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3375             }
3376
3377             if(h->short_ref_count)
3378                 memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3379
3380             h->short_ref[0]= s->current_picture_ptr;
3381             h->short_ref_count++;
3382             s->current_picture_ptr->reference |= s->picture_structure;
3383         }
3384     }
3385
3386     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3387
3388         /* We have too many reference frames, probably due to corrupted
3389          * stream. Need to discard one frame. Prevents overrun of the
3390          * short_ref and long_ref buffers.
3391          */
3392         av_log(h->s.avctx, AV_LOG_ERROR,
3393                "number of reference frames exceeds max (probably "
3394                "corrupt input), discarding one\n");
3395
3396         if (h->long_ref_count && !h->short_ref_count) {
3397             for (i = 0; i < 16; ++i)
3398                 if (h->long_ref[i])
3399                     break;
3400
3401             assert(i < 16);
3402             remove_long(h, i, 0);
3403         } else {
3404             pic = h->short_ref[h->short_ref_count - 1];
3405             remove_short(h, pic->frame_num, 0);
3406         }
3407     }
3408
3409     print_short_term(h);
3410     print_long_term(h);
3411     return 0;
3412 }
3413
3414 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3415     MpegEncContext * const s = &h->s;
3416     int i;
3417
3418     h->mmco_index= 0;
3419     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3420         s->broken_link= get_bits1(gb) -1;
3421         if(get_bits1(gb)){
3422             h->mmco[0].opcode= MMCO_LONG;
3423             h->mmco[0].long_arg= 0;
3424             h->mmco_index= 1;
3425         }
3426     }else{
3427         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3428             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3429                 MMCOOpcode opcode= get_ue_golomb_31(gb);
3430
3431                 h->mmco[i].opcode= opcode;
3432                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3433                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3434 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3435                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3436                         return -1;
3437                     }*/
3438                 }
3439                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3440                     unsigned int long_arg= get_ue_golomb_31(gb);
3441                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3442                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3443                         return -1;
3444                     }
3445                     h->mmco[i].long_arg= long_arg;
3446                 }
3447
3448                 if(opcode > (unsigned)MMCO_LONG){
3449                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3450                     return -1;
3451                 }
3452                 if(opcode == MMCO_END)
3453                     break;
3454             }
3455             h->mmco_index= i;
3456         }else{
3457             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3458
3459             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3460                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3461                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3462                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3463                 h->mmco_index= 1;
3464                 if (FIELD_PICTURE) {
3465                     h->mmco[0].short_pic_num *= 2;
3466                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3467                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3468                     h->mmco_index= 2;
3469                 }
3470             }
3471         }
3472     }
3473
3474     return 0;
3475 }
3476
3477 static int init_poc(H264Context *h){
3478     MpegEncContext * const s = &h->s;
3479     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3480     int field_poc[2];
3481     Picture *cur = s->current_picture_ptr;
3482
3483     h->frame_num_offset= h->prev_frame_num_offset;
3484     if(h->frame_num < h->prev_frame_num)
3485         h->frame_num_offset += max_frame_num;
3486
3487     if(h->sps.poc_type==0){
3488         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3489
3490         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3491             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3492         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3493             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3494         else
3495             h->poc_msb = h->prev_poc_msb;
3496 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3497         field_poc[0] =
3498         field_poc[1] = h->poc_msb + h->poc_lsb;
3499         if(s->picture_structure == PICT_FRAME)
3500             field_poc[1] += h->delta_poc_bottom;
3501     }else if(h->sps.poc_type==1){
3502         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3503         int i;
3504
3505         if(h->sps.poc_cycle_length != 0)
3506             abs_frame_num = h->frame_num_offset + h->frame_num;
3507         else
3508             abs_frame_num = 0;
3509
3510         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3511             abs_frame_num--;
3512
3513         expected_delta_per_poc_cycle = 0;
3514         for(i=0; i < h->sps.poc_cycle_length; i++)
3515             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3516
3517         if(abs_frame_num > 0){
3518             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3519             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3520
3521             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3522             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3523                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3524         } else
3525             expectedpoc = 0;
3526
3527         if(h->nal_ref_idc == 0)
3528             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3529
3530         field_poc[0] = expectedpoc + h->delta_poc[0];
3531         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3532
3533         if(s->picture_structure == PICT_FRAME)
3534             field_poc[1] += h->delta_poc[1];
3535     }else{
3536         int poc= 2*(h->frame_num_offset + h->frame_num);
3537
3538         if(!h->nal_ref_idc)
3539             poc--;
3540
3541         field_poc[0]= poc;
3542         field_poc[1]= poc;
3543     }
3544
3545     if(s->picture_structure != PICT_BOTTOM_FIELD)
3546         s->current_picture_ptr->field_poc[0]= field_poc[0];
3547     if(s->picture_structure != PICT_TOP_FIELD)
3548         s->current_picture_ptr->field_poc[1]= field_poc[1];
3549     cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3550
3551     return 0;
3552 }
3553
3554
3555 /**
3556  * initialize scan tables
3557  */
3558 static void init_scan_tables(H264Context *h){
3559     MpegEncContext * const s = &h->s;
3560     int i;
3561     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3562         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3563         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3564     }else{
3565         for(i=0; i<16; i++){
3566 #define T(x) (x>>2) | ((x<<2) & 0xF)
3567             h->zigzag_scan[i] = T(zigzag_scan[i]);
3568             h-> field_scan[i] = T( field_scan[i]);
3569 #undef T
3570         }
3571     }
3572     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3573         memcpy(h->zigzag_scan8x8,       ff_zigzag_direct,     64*sizeof(uint8_t));
3574         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3575         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3576         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3577     }else{
3578         for(i=0; i<64; i++){
3579 #define T(x) (x>>3) | ((x&7)<<3)
3580             h->zigzag_scan8x8[i]       = T(ff_zigzag_direct[i]);
3581             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3582             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3583             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3584 #undef T
3585         }
3586     }
3587     if(h->sps.transform_bypass){ //FIXME same ugly
3588         h->zigzag_scan_q0          = zigzag_scan;
3589         h->zigzag_scan8x8_q0       = ff_zigzag_direct;
3590         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3591         h->field_scan_q0           = field_scan;
3592         h->field_scan8x8_q0        = field_scan8x8;
3593         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3594     }else{
3595         h->zigzag_scan_q0          = h->zigzag_scan;
3596         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3597         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3598         h->field_scan_q0           = h->field_scan;
3599         h->field_scan8x8_q0        = h->field_scan8x8;
3600         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3601     }
3602 }
3603
3604 /**
3605  * Replicates H264 "master" context to thread contexts.
3606  */
3607 static void clone_slice(H264Context *dst, H264Context *src)
3608 {
3609     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3610     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3611     dst->s.current_picture      = src->s.current_picture;
3612     dst->s.linesize             = src->s.linesize;
3613     dst->s.uvlinesize           = src->s.uvlinesize;
3614     dst->s.first_field          = src->s.first_field;
3615
3616     dst->prev_poc_msb           = src->prev_poc_msb;
3617     dst->prev_poc_lsb           = src->prev_poc_lsb;
3618     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3619     dst->prev_frame_num         = src->prev_frame_num;
3620     dst->short_ref_count        = src->short_ref_count;
3621
3622     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3623     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3624     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3625     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3626
3627     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3628     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3629 }
3630
3631 /**
3632  * decodes a slice header.
3633  * This will also call MPV_common_init() and frame_start() as needed.
3634  *
3635  * @param h h264context
3636  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3637  *
3638  * @return 0 if okay, <0 if an error occurred, 1 if decoding must not be multithreaded
3639  */
3640 static int decode_slice_header(H264Context *h, H264Context *h0){
3641     MpegEncContext * const s = &h->s;
3642     MpegEncContext * const s0 = &h0->s;
3643     unsigned int first_mb_in_slice;
3644     unsigned int pps_id;
3645     int num_ref_idx_active_override_flag;
3646     unsigned int slice_type, tmp, i, j;
3647     int default_ref_list_done = 0;
3648     int last_pic_structure;
3649
3650     s->dropable= h->nal_ref_idc == 0;
3651
3652     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3653         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3654         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3655     }else{
3656         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3657         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3658     }
3659
3660     first_mb_in_slice= get_ue_golomb(&s->gb);
3661
3662     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3663         h0->current_slice = 0;
3664         if (!s0->first_field)
3665             s->current_picture_ptr= NULL;
3666     }
3667
3668     slice_type= get_ue_golomb_31(&s->gb);
3669     if(slice_type > 9){
3670         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3671         return -1;
3672     }
3673     if(slice_type > 4){
3674         slice_type -= 5;
3675         h->slice_type_fixed=1;
3676     }else
3677         h->slice_type_fixed=0;
3678
3679     slice_type= golomb_to_pict_type[ slice_type ];
3680     if (slice_type == FF_I_TYPE
3681         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3682         default_ref_list_done = 1;
3683     }
3684     h->slice_type= slice_type;
3685     h->slice_type_nos= slice_type & 3;
3686
3687     s->pict_type= h->slice_type; // to make a few old functions happy, it's wrong though
3688     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3689         av_log(h->s.avctx, AV_LOG_ERROR,
3690                "B picture before any references, skipping\n");
3691         return -1;
3692     }
3693
3694     pps_id= get_ue_golomb(&s->gb);
3695     if(pps_id>=MAX_PPS_COUNT){
3696         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3697         return -1;
3698     }
3699     if(!h0->pps_buffers[pps_id]) {
3700         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing PPS referenced\n");
3701         return -1;
3702     }
3703     h->pps= *h0->pps_buffers[pps_id];
3704
3705     if(!h0->sps_buffers[h->pps.sps_id]) {
3706         av_log(h->s.avctx, AV_LOG_ERROR, "non-existing SPS referenced\n");
3707         return -1;
3708     }
3709     h->sps = *h0->sps_buffers[h->pps.sps_id];
3710
3711     if(h == h0 && h->dequant_coeff_pps != pps_id){
3712         h->dequant_coeff_pps = pps_id;
3713         init_dequant_tables(h);
3714     }
3715
3716     s->mb_width= h->sps.mb_width;
3717     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3718
3719     h->b_stride=  s->mb_width*4;
3720     h->b8_stride= s->mb_width*2;
3721
3722     s->width = 16*s->mb_width - 2*FFMIN(h->sps.crop_right, 7);
3723     if(h->sps.frame_mbs_only_flag)
3724         s->height= 16*s->mb_height - 2*FFMIN(h->sps.crop_bottom, 7);
3725     else
3726         s->height= 16*s->mb_height - 4*FFMIN(h->sps.crop_bottom, 3);
3727
3728     if (s->context_initialized
3729         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3730         if(h != h0)
3731             return -1;   // width / height changed during parallelized decoding
3732         free_tables(h);
3733         flush_dpb(s->avctx);
3734         MPV_common_end(s);
3735     }
3736     if (!s->context_initialized) {
3737         if(h != h0)
3738             return -1;  // we cant (re-)initialize context during parallel decoding
3739         if (MPV_common_init(s) < 0)
3740             return -1;
3741         s->first_field = 0;
3742
3743         init_scan_tables(h);
3744         alloc_tables(h);
3745
3746         for(i = 1; i < s->avctx->thread_count; i++) {
3747             H264Context *c;
3748             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3749             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3750             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3751             c->sps = h->sps;
3752             c->pps = h->pps;
3753             init_scan_tables(c);
3754             clone_tables(c, h);
3755         }
3756
3757         for(i = 0; i < s->avctx->thread_count; i++)
3758             if(context_init(h->thread_context[i]) < 0)
3759                 return -1;
3760
3761         s->avctx->width = s->width;
3762         s->avctx->height = s->height;
3763         s->avctx->sample_aspect_ratio= h->sps.sar;
3764         if(!s->avctx->sample_aspect_ratio.den)
3765             s->avctx->sample_aspect_ratio.den = 1;
3766
3767         if(h->sps.timing_info_present_flag){
3768             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3769             if(h->x264_build > 0 && h->x264_build < 44)
3770                 s->avctx->time_base.den *= 2;
3771             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3772                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3773         }
3774     }
3775
3776     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3777
3778     h->mb_mbaff = 0;
3779     h->mb_aff_frame = 0;
3780     last_pic_structure = s0->picture_structure;
3781     if(h->sps.frame_mbs_only_flag){
3782         s->picture_structure= PICT_FRAME;
3783     }else{
3784         if(get_bits1(&s->gb)) { //field_pic_flag
3785             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3786         } else {
3787             s->picture_structure= PICT_FRAME;
3788             h->mb_aff_frame = h->sps.mb_aff;
3789         }
3790     }
3791     h->mb_field_decoding_flag= s->picture_structure != PICT_FRAME;
3792
3793     if(h0->current_slice == 0){
3794         while(h->frame_num !=  h->prev_frame_num &&
3795               h->frame_num != (h->prev_frame_num+1)%(1<<h->sps.log2_max_frame_num)){
3796             av_log(NULL, AV_LOG_DEBUG, "Frame num gap %d %d\n", h->frame_num, h->prev_frame_num);
3797             frame_start(h);
3798             h->prev_frame_num++;
3799             h->prev_frame_num %= 1<<h->sps.log2_max_frame_num;
3800             s->current_picture_ptr->frame_num= h->prev_frame_num;
3801             execute_ref_pic_marking(h, NULL, 0);
3802         }
3803
3804         /* See if we have a decoded first field looking for a pair... */
3805         if (s0->first_field) {
3806             assert(s0->current_picture_ptr);
3807             assert(s0->current_picture_ptr->data[0]);
3808             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
3809
3810             /* figure out if we have a complementary field pair */
3811             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
3812                 /*
3813                  * Previous field is unmatched. Don't display it, but let it
3814                  * remain for reference if marked as such.
3815                  */
3816                 s0->current_picture_ptr = NULL;
3817                 s0->first_field = FIELD_PICTURE;
3818
3819             } else {
3820                 if (h->nal_ref_idc &&
3821                         s0->current_picture_ptr->reference &&
3822                         s0->current_picture_ptr->frame_num != h->frame_num) {
3823                     /*
3824                      * This and previous field were reference, but had
3825                      * different frame_nums. Consider this field first in
3826                      * pair. Throw away previous field except for reference
3827                      * purposes.
3828                      */
3829                     s0->first_field = 1;
3830                     s0->current_picture_ptr = NULL;
3831
3832                 } else {
3833                     /* Second field in complementary pair */
3834                     s0->first_field = 0;
3835                 }
3836             }
3837
3838         } else {
3839             /* Frame or first field in a potentially complementary pair */
3840             assert(!s0->current_picture_ptr);
3841             s0->first_field = FIELD_PICTURE;
3842         }
3843
3844         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
3845             s0->first_field = 0;
3846             return -1;
3847         }
3848     }
3849     if(h != h0)
3850         clone_slice(h, h0);
3851
3852     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
3853
3854     assert(s->mb_num == s->mb_width * s->mb_height);
3855     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
3856        first_mb_in_slice                    >= s->mb_num){
3857         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
3858         return -1;
3859     }
3860     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3861     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
3862     if (s->picture_structure == PICT_BOTTOM_FIELD)
3863         s->resync_mb_y = s->mb_y = s->mb_y + 1;
3864     assert(s->mb_y < s->mb_height);
3865
3866     if(s->picture_structure==PICT_FRAME){
3867         h->curr_pic_num=   h->frame_num;
3868         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3869     }else{
3870         h->curr_pic_num= 2*h->frame_num + 1;
3871         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3872     }
3873
3874     if(h->nal_unit_type == NAL_IDR_SLICE){
3875         get_ue_golomb(&s->gb); /* idr_pic_id */
3876     }
3877
3878     if(h->sps.poc_type==0){
3879         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3880
3881         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3882             h->delta_poc_bottom= get_se_golomb(&s->gb);
3883         }
3884     }
3885
3886     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3887         h->delta_poc[0]= get_se_golomb(&s->gb);
3888
3889         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3890             h->delta_poc[1]= get_se_golomb(&s->gb);
3891     }
3892
3893     init_poc(h);
3894
3895     if(h->pps.redundant_pic_cnt_present){
3896         h->redundant_pic_count= get_ue_golomb(&s->gb);
3897     }
3898
3899     //set defaults, might be overridden a few lines later
3900     h->ref_count[0]= h->pps.ref_count[0];
3901     h->ref_count[1]= h->pps.ref_count[1];
3902
3903     if(h->slice_type_nos != FF_I_TYPE){
3904         if(h->slice_type_nos == FF_B_TYPE){
3905             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3906         }
3907         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3908
3909         if(num_ref_idx_active_override_flag){
3910             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3911             if(h->slice_type_nos==FF_B_TYPE)
3912                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3913
3914             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
3915                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3916                 h->ref_count[0]= h->ref_count[1]= 1;
3917                 return -1;
3918             }
3919         }
3920         if(h->slice_type_nos == FF_B_TYPE)
3921             h->list_count= 2;
3922         else
3923             h->list_count= 1;
3924     }else
3925         h->list_count= 0;
3926
3927     if(!default_ref_list_done){
3928         fill_default_ref_list(h);
3929     }
3930
3931     if(h->slice_type_nos!=FF_I_TYPE && decode_ref_pic_list_reordering(h) < 0)
3932         return -1;
3933
3934     if(h->slice_type_nos!=FF_I_TYPE){
3935         s->last_picture_ptr= &h->ref_list[0][0];
3936         ff_copy_picture(&s->last_picture, s->last_picture_ptr);
3937     }
3938     if(h->slice_type_nos==FF_B_TYPE){
3939         s->next_picture_ptr= &h->ref_list[1][0];
3940         ff_copy_picture(&s->next_picture, s->next_picture_ptr);
3941     }
3942
3943     if(   (h->pps.weighted_pred          && h->slice_type_nos == FF_P_TYPE )
3944        ||  (h->pps.weighted_bipred_idc==1 && h->slice_type_nos== FF_B_TYPE ) )
3945         pred_weight_table(h);
3946     else if(h->pps.weighted_bipred_idc==2 && h->slice_type_nos== FF_B_TYPE)
3947         implicit_weight_table(h);
3948     else {
3949         h->use_weight = 0;
3950         for (i = 0; i < 2; i++) {
3951             h->luma_weight_flag[i]   = 0;
3952             h->chroma_weight_flag[i] = 0;
3953         }
3954     }
3955
3956     if(h->nal_ref_idc)
3957         decode_ref_pic_marking(h0, &s->gb);
3958
3959     if(FRAME_MBAFF)
3960         fill_mbaff_ref_list(h);
3961
3962     if(h->slice_type_nos==FF_B_TYPE && !h->direct_spatial_mv_pred)
3963         direct_dist_scale_factor(h);
3964     direct_ref_list_init(h);
3965
3966     if( h->slice_type_nos != FF_I_TYPE && h->pps.cabac ){
3967         tmp = get_ue_golomb_31(&s->gb);
3968         if(tmp > 2){
3969             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
3970             return -1;
3971         }
3972         h->cabac_init_idc= tmp;
3973     }
3974
3975     h->last_qscale_diff = 0;
3976     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
3977     if(tmp>51){
3978         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
3979         return -1;
3980     }
3981     s->qscale= tmp;
3982     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
3983     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
3984     //FIXME qscale / qp ... stuff
3985     if(h->slice_type == FF_SP_TYPE){
3986         get_bits1(&s->gb); /* sp_for_switch_flag */
3987     }
3988     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
3989         get_se_golomb(&s->gb); /* slice_qs_delta */
3990     }
3991
3992     h->deblocking_filter = 1;
3993     h->slice_alpha_c0_offset = 0;
3994     h->slice_beta_offset = 0;
3995     if( h->pps.deblocking_filter_parameters_present ) {
3996         tmp= get_ue_golomb_31(&s->gb);
3997         if(tmp > 2){
3998             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
3999             return -1;
4000         }
4001         h->deblocking_filter= tmp;
4002         if(h->deblocking_filter < 2)
4003             h->deblocking_filter^= 1; // 1<->0
4004
4005         if( h->deblocking_filter ) {
4006             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4007             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4008         }
4009     }
4010
4011     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4012        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type_nos != FF_I_TYPE)
4013        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type_nos == FF_B_TYPE)
4014        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4015         h->deblocking_filter= 0;
4016
4017     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4018         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4019             /* Cheat slightly for speed:
4020                Do not bother to deblock across slices. */
4021             h->deblocking_filter = 2;
4022         } else {
4023             h0->max_contexts = 1;
4024             if(!h0->single_decode_warning) {
4025                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4026                 h0->single_decode_warning = 1;
4027             }
4028             if(h != h0)
4029                 return 1; // deblocking switched inside frame
4030         }
4031     }
4032
4033 #if 0 //FMO
4034     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4035         slice_group_change_cycle= get_bits(&s->gb, ?);
4036 #endif
4037
4038     h0->last_slice_type = slice_type;
4039     h->slice_num = ++h0->current_slice;
4040     if(h->slice_num >= MAX_SLICES){
4041         av_log(s->avctx, AV_LOG_ERROR, "Too many slices, increase MAX_SLICES and recompile\n");
4042     }
4043
4044     for(j=0; j<2; j++){
4045         int *ref2frm= h->ref2frm[h->slice_num&(MAX_SLICES-1)][j];
4046         ref2frm[0]=
4047         ref2frm[1]= -1;
4048         for(i=0; i<16; i++)
4049             ref2frm[i+2]= 4*h->ref_list[j][i].frame_num
4050                           +(h->ref_list[j][i].reference&3);
4051         ref2frm[18+0]=
4052         ref2frm[18+1]= -1;
4053         for(i=16; i<48; i++)
4054             ref2frm[i+4]= 4*h->ref_list[j][i].frame_num
4055                           +(h->ref_list[j][i].reference&3);
4056     }
4057
4058     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4059     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4060
4061     s->avctx->refs= h->sps.ref_frame_count;
4062
4063     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4064         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c%s%s pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s %s\n",
4065                h->slice_num,
4066                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4067                first_mb_in_slice,
4068                av_get_pict_type_char(h->slice_type), h->slice_type_fixed ? " fix" : "", h->nal_unit_type == NAL_IDR_SLICE ? " IDR" : "",
4069                pps_id, h->frame_num,
4070                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4071                h->ref_count[0], h->ref_count[1],
4072                s->qscale,
4073                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4074                h->use_weight,
4075                h->use_weight==1 && h->use_weight_chroma ? "c" : "",
4076                h->slice_type == FF_B_TYPE ? (h->direct_spatial_mv_pred ? "SPAT" : "TEMP") : ""
4077                );
4078     }
4079
4080     return 0;
4081 }
4082
4083 /**
4084  *
4085  */
4086 static inline int get_level_prefix(GetBitContext *gb){
4087     unsigned int buf;
4088     int log;
4089
4090     OPEN_READER(re, gb);
4091     UPDATE_CACHE(re, gb);
4092     buf=GET_CACHE(re, gb);
4093
4094     log= 32 - av_log2(buf);
4095 #ifdef TRACE
4096     print_bin(buf>>(32-log), log);
4097     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4098 #endif
4099
4100     LAST_SKIP_BITS(re, gb, log);
4101     CLOSE_READER(re, gb);
4102
4103     return log-1;
4104 }
4105
4106 static inline int get_dct8x8_allowed(H264Context *h){
4107     if(h->sps.direct_8x8_inference_flag)
4108         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8                )*0x0001000100010001ULL));
4109     else
4110         return !(*(uint64_t*)h->sub_mb_type & ((MB_TYPE_16x8|MB_TYPE_8x16|MB_TYPE_8x8|MB_TYPE_DIRECT2)*0x0001000100010001ULL));
4111 }
4112
4113 /**
4114  * decodes a residual block.
4115  * @param n block index
4116  * @param scantable scantable
4117  * @param max_coeff number of coefficients in the block
4118  * @return <0 if an error occurred
4119  */
4120 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4121     MpegEncContext * const s = &h->s;
4122     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4123     int level[16];
4124     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4125
4126     //FIXME put trailing_onex into the context
4127
4128     if(n == CHROMA_DC_BLOCK_INDEX){
4129         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4130         total_coeff= coeff_token>>2;
4131     }else{
4132         if(n == LUMA_DC_BLOCK_INDEX){
4133             total_coeff= pred_non_zero_count(h, 0);
4134             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4135             total_coeff= coeff_token>>2;
4136         }else{
4137             total_coeff= pred_non_zero_count(h, n);
4138             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4139             total_coeff= coeff_token>>2;
4140             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4141         }
4142     }
4143
4144     //FIXME set last_non_zero?
4145
4146     if(total_coeff==0)
4147         return 0;
4148     if(total_coeff > (unsigned)max_coeff) {
4149         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4150         return -1;
4151     }
4152
4153     trailing_ones= coeff_token&3;
4154     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4155     assert(total_coeff<=16);
4156
4157     i = show_bits(gb, 3);
4158     skip_bits(gb, trailing_ones);
4159     level[0] = 1-((i&4)>>1);
4160     level[1] = 1-((i&2)   );
4161     level[2] = 1-((i&1)<<1);
4162
4163     if(trailing_ones<total_coeff) {
4164         int mask, prefix;
4165         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4166         int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4167         int level_code= cavlc_level_tab[suffix_length][bitsi][0];
4168
4169         skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4170         if(level_code >= 100){
4171             prefix= level_code - 100;
4172             if(prefix == LEVEL_TAB_BITS)
4173                 prefix += get_level_prefix(gb);
4174
4175             //first coefficient has suffix_length equal to 0 or 1
4176             if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4177                 if(suffix_length)
4178                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4179                 else
4180                     level_code= (prefix<<suffix_length); //part
4181             }else if(prefix==14){
4182                 if(suffix_length)
4183                     level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4184                 else
4185                     level_code= prefix + get_bits(gb, 4); //part
4186             }else{
4187                 level_code= (15<<suffix_length) + get_bits(gb, prefix-3); //part
4188                 if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4189                 if(prefix>=16)
4190                     level_code += (1<<(prefix-3))-4096;
4191             }
4192
4193             if(trailing_ones < 3) level_code += 2;
4194
4195             suffix_length = 2;
4196             mask= -(level_code&1);
4197             level[trailing_ones]= (((2+level_code)>>1) ^ mask) - mask;
4198         }else{
4199             if(trailing_ones < 3) level_code += (level_code>>31)|1;
4200
4201             suffix_length = 1;
4202             if(level_code + 3U > 6U)
4203                 suffix_length++;
4204             level[trailing_ones]= level_code;
4205         }
4206
4207         //remaining coefficients have suffix_length > 0
4208         for(i=trailing_ones+1;i<total_coeff;i++) {
4209             static const unsigned int suffix_limit[7] = {0,3,6,12,24,48,INT_MAX };
4210             int bitsi= show_bits(gb, LEVEL_TAB_BITS);
4211             level_code= cavlc_level_tab[suffix_length][bitsi][0];
4212
4213             skip_bits(gb, cavlc_level_tab[suffix_length][bitsi][1]);
4214             if(level_code >= 100){
4215                 prefix= level_code - 100;
4216                 if(prefix == LEVEL_TAB_BITS){
4217                     prefix += get_level_prefix(gb);
4218                 }
4219                 if(prefix<15){
4220                     level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4221                 }else{
4222                     level_code = (15<<suffix_length) + get_bits(gb, prefix-3);
4223                     if(prefix>=16)
4224                         level_code += (1<<(prefix-3))-4096;
4225                 }
4226                 mask= -(level_code&1);
4227                 level_code= (((2+level_code)>>1) ^ mask) - mask;
4228             }
4229             level[i]= level_code;
4230
4231             if(suffix_limit[suffix_length] + level_code > 2U*suffix_limit[suffix_length])
4232                 suffix_length++;
4233         }
4234     }
4235
4236     if(total_coeff == max_coeff)
4237         zeros_left=0;
4238     else{
4239         if(n == CHROMA_DC_BLOCK_INDEX)
4240             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4241         else
4242             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4243     }
4244
4245     coeff_num = zeros_left + total_coeff - 1;
4246     j = scantable[coeff_num];
4247     if(n > 24){
4248         block[j] = level[0];
4249         for(i=1;i<total_coeff;i++) {
4250             if(zeros_left <= 0)
4251                 run_before = 0;
4252             else if(zeros_left < 7){
4253                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4254             }else{
4255                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4256             }
4257             zeros_left -= run_before;
4258             coeff_num -= 1 + run_before;
4259             j= scantable[ coeff_num ];
4260
4261             block[j]= level[i];
4262         }
4263     }else{
4264         block[j] = (level[0] * qmul[j] + 32)>>6;
4265         for(i=1;i<total_coeff;i++) {
4266             if(zeros_left <= 0)
4267                 run_before = 0;
4268             else if(zeros_left < 7){
4269                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4270             }else{
4271                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4272             }
4273             zeros_left -= run_before;
4274             coeff_num -= 1 + run_before;
4275             j= scantable[ coeff_num ];
4276
4277             block[j]= (level[i] * qmul[j] + 32)>>6;
4278         }
4279     }
4280
4281     if(zeros_left<0){
4282         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4283         return -1;
4284     }
4285
4286     return 0;
4287 }
4288
4289 static void predict_field_decoding_flag(H264Context *h){
4290     MpegEncContext * const s = &h->s;
4291     const int mb_xy= h->mb_xy;
4292     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4293                 ? s->current_picture.mb_type[mb_xy-1]
4294                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4295                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4296                 : 0;
4297     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4298 }
4299
4300 /**
4301  * decodes a P_SKIP or B_SKIP macroblock
4302  */
4303 static void decode_mb_skip(H264Context *h){
4304     MpegEncContext * const s = &h->s;
4305     const int mb_xy= h->mb_xy;
4306     int mb_type=0;
4307
4308     memset(h->non_zero_count[mb_xy], 0, 16);
4309     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4310
4311     if(MB_FIELD)
4312         mb_type|= MB_TYPE_INTERLACED;
4313
4314     if( h->slice_type_nos == FF_B_TYPE )
4315     {
4316         // just for fill_caches. pred_direct_motion will set the real mb_type
4317         mb_type|= MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4318
4319         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4320         pred_direct_motion(h, &mb_type);
4321         mb_type|= MB_TYPE_SKIP;
4322     }
4323     else
4324     {
4325         int mx, my;
4326         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4327
4328         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4329         pred_pskip_motion(h, &mx, &my);
4330         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4331         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4332     }
4333
4334     write_back_motion(h, mb_type);
4335     s->current_picture.mb_type[mb_xy]= mb_type;
4336     s->current_picture.qscale_table[mb_xy]= s->qscale;
4337     h->slice_table[ mb_xy ]= h->slice_num;
4338     h->prev_mb_skipped= 1;
4339 }
4340
4341 /**
4342  * decodes a macroblock
4343  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4344  */
4345 static int decode_mb_cavlc(H264Context *h){
4346     MpegEncContext * const s = &h->s;
4347     int mb_xy;
4348     int partition_count;
4349     unsigned int mb_type, cbp;
4350     int dct8x8_allowed= h->pps.transform_8x8_mode;
4351
4352     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4353
4354     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4355     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4356                 down the code */
4357     if(h->slice_type_nos != FF_I_TYPE){
4358         if(s->mb_skip_run==-1)
4359             s->mb_skip_run= get_ue_golomb(&s->gb);
4360
4361         if (s->mb_skip_run--) {
4362             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4363                 if(s->mb_skip_run==0)
4364                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4365                 else
4366                     predict_field_decoding_flag(h);
4367             }
4368             decode_mb_skip(h);
4369             return 0;
4370         }
4371     }
4372     if(FRAME_MBAFF){
4373         if( (s->mb_y&1) == 0 )
4374             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4375     }
4376
4377     h->prev_mb_skipped= 0;
4378
4379     mb_type= get_ue_golomb(&s->gb);
4380     if(h->slice_type_nos == FF_B_TYPE){
4381         if(mb_type < 23){
4382             partition_count= b_mb_type_info[mb_type].partition_count;
4383             mb_type=         b_mb_type_info[mb_type].type;
4384         }else{
4385             mb_type -= 23;
4386             goto decode_intra_mb;
4387         }
4388     }else if(h->slice_type_nos == FF_P_TYPE){
4389         if(mb_type < 5){
4390             partition_count= p_mb_type_info[mb_type].partition_count;
4391             mb_type=         p_mb_type_info[mb_type].type;
4392         }else{
4393             mb_type -= 5;
4394             goto decode_intra_mb;
4395         }
4396     }else{
4397        assert(h->slice_type_nos == FF_I_TYPE);
4398         if(h->slice_type == FF_SI_TYPE && mb_type)
4399             mb_type--;
4400 decode_intra_mb:
4401         if(mb_type > 25){
4402             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4403             return -1;
4404         }
4405         partition_count=0;
4406         cbp= i_mb_type_info[mb_type].cbp;
4407         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4408         mb_type= i_mb_type_info[mb_type].type;
4409     }
4410
4411     if(MB_FIELD)
4412         mb_type |= MB_TYPE_INTERLACED;
4413
4414     h->slice_table[ mb_xy ]= h->slice_num;
4415
4416     if(IS_INTRA_PCM(mb_type)){
4417         unsigned int x;
4418
4419         // We assume these blocks are very rare so we do not optimize it.
4420         align_get_bits(&s->gb);
4421
4422         // The pixels are stored in the same order as levels in h->mb array.
4423         for(x=0; x < (CHROMA ? 384 : 256); x++){
4424             ((uint8_t*)h->mb)[x]= get_bits(&s->gb, 8);
4425         }
4426
4427         // In deblocking, the quantizer is 0
4428         s->current_picture.qscale_table[mb_xy]= 0;
4429         // All coeffs are present
4430         memset(h->non_zero_count[mb_xy], 16, 16);
4431
4432         s->current_picture.mb_type[mb_xy]= mb_type;
4433         return 0;
4434     }
4435
4436     if(MB_MBAFF){
4437         h->ref_count[0] <<= 1;
4438         h->ref_count[1] <<= 1;
4439     }
4440
4441     fill_caches(h, mb_type, 0);
4442
4443     //mb_pred
4444     if(IS_INTRA(mb_type)){
4445         int pred_mode;
4446 //            init_top_left_availability(h);
4447         if(IS_INTRA4x4(mb_type)){
4448             int i;
4449             int di = 1;
4450             if(dct8x8_allowed && get_bits1(&s->gb)){
4451                 mb_type |= MB_TYPE_8x8DCT;
4452                 di = 4;
4453             }
4454
4455 //                fill_intra4x4_pred_table(h);
4456             for(i=0; i<16; i+=di){
4457                 int mode= pred_intra_mode(h, i);
4458
4459                 if(!get_bits1(&s->gb)){
4460                     const int rem_mode= get_bits(&s->gb, 3);
4461                     mode = rem_mode + (rem_mode >= mode);
4462                 }
4463
4464                 if(di==4)
4465                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4466                 else
4467                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4468             }
4469             write_back_intra_pred_mode(h);
4470             if( check_intra4x4_pred_mode(h) < 0)
4471                 return -1;
4472         }else{
4473             h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4474             if(h->intra16x16_pred_mode < 0)
4475                 return -1;
4476         }
4477         if(CHROMA){
4478             pred_mode= check_intra_pred_mode(h, get_ue_golomb_31(&s->gb));
4479             if(pred_mode < 0)
4480                 return -1;
4481             h->chroma_pred_mode= pred_mode;
4482         }
4483     }else if(partition_count==4){
4484         int i, j, sub_partition_count[4], list, ref[2][4];
4485
4486         if(h->slice_type_nos == FF_B_TYPE){
4487             for(i=0; i<4; i++){
4488                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4489                 if(h->sub_mb_type[i] >=13){
4490                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4491                     return -1;
4492                 }
4493                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4494                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4495             }
4496             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4497                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4498                 pred_direct_motion(h, &mb_type);
4499                 h->ref_cache[0][scan8[4]] =
4500                 h->ref_cache[1][scan8[4]] =
4501                 h->ref_cache[0][scan8[12]] =
4502                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4503             }
4504         }else{
4505             assert(h->slice_type_nos == FF_P_TYPE); //FIXME SP correct ?
4506             for(i=0; i<4; i++){
4507                 h->sub_mb_type[i]= get_ue_golomb_31(&s->gb);
4508                 if(h->sub_mb_type[i] >=4){
4509                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4510                     return -1;
4511                 }
4512                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4513                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4514             }
4515         }
4516
4517         for(list=0; list<h->list_count; list++){
4518             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4519             for(i=0; i<4; i++){
4520                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4521                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4522                     unsigned int tmp;
4523                     if(ref_count == 1){
4524                         tmp= 0;
4525                     }else if(ref_count == 2){
4526                         tmp= get_bits1(&s->gb)^1;
4527                     }else{
4528                         tmp= get_ue_golomb_31(&s->gb);
4529                         if(tmp>=ref_count){
4530                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4531                             return -1;
4532                         }
4533                     }
4534                     ref[list][i]= tmp;
4535                 }else{
4536                  //FIXME
4537                     ref[list][i] = -1;
4538                 }
4539             }
4540         }
4541
4542         if(dct8x8_allowed)
4543             dct8x8_allowed = get_dct8x8_allowed(h);
4544
4545         for(list=0; list<h->list_count; list++){
4546             for(i=0; i<4; i++){
4547                 if(IS_DIRECT(h->sub_mb_type[i])) {
4548                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4549                     continue;
4550                 }
4551                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4552                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4553
4554                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4555                     const int sub_mb_type= h->sub_mb_type[i];
4556                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4557                     for(j=0; j<sub_partition_count[i]; j++){
4558                         int mx, my;
4559                         const int index= 4*i + block_width*j;
4560                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4561                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4562                         mx += get_se_golomb(&s->gb);
4563                         my += get_se_golomb(&s->gb);
4564                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4565
4566                         if(IS_SUB_8X8(sub_mb_type)){
4567                             mv_cache[ 1 ][0]=
4568                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4569                             mv_cache[ 1 ][1]=
4570                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4571                         }else if(IS_SUB_8X4(sub_mb_type)){
4572                             mv_cache[ 1 ][0]= mx;
4573                             mv_cache[ 1 ][1]= my;
4574                         }else if(IS_SUB_4X8(sub_mb_type)){
4575                             mv_cache[ 8 ][0]= mx;
4576                             mv_cache[ 8 ][1]= my;
4577                         }
4578                         mv_cache[ 0 ][0]= mx;
4579                         mv_cache[ 0 ][1]= my;
4580                     }
4581                 }else{
4582                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4583                     p[0] = p[1]=
4584                     p[8] = p[9]= 0;
4585                 }
4586             }
4587         }
4588     }else if(IS_DIRECT(mb_type)){
4589         pred_direct_motion(h, &mb_type);
4590         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4591     }else{
4592         int list, mx, my, i;
4593          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4594         if(IS_16X16(mb_type)){
4595             for(list=0; list<h->list_count; list++){
4596                     unsigned int val;
4597                     if(IS_DIR(mb_type, 0, list)){
4598                         if(h->ref_count[list]==1){
4599                             val= 0;
4600                         }else if(h->ref_count[list]==2){
4601                             val= get_bits1(&s->gb)^1;
4602                         }else{
4603                             val= get_ue_golomb_31(&s->gb);
4604                             if(val >= h->ref_count[list]){
4605                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4606                                 return -1;
4607                             }
4608                         }
4609                     }else
4610                         val= LIST_NOT_USED&0xFF;
4611                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4612             }
4613             for(list=0; list<h->list_count; list++){
4614                 unsigned int val;
4615                 if(IS_DIR(mb_type, 0, list)){
4616                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4617                     mx += get_se_golomb(&s->gb);
4618                     my += get_se_golomb(&s->gb);
4619                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4620
4621                     val= pack16to32(mx,my);
4622                 }else
4623                     val=0;
4624                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4625             }
4626         }
4627         else if(IS_16X8(mb_type)){
4628             for(list=0; list<h->list_count; list++){
4629                     for(i=0; i<2; i++){
4630                         unsigned int val;
4631                         if(IS_DIR(mb_type, i, list)){
4632                             if(h->ref_count[list] == 1){
4633                                 val= 0;
4634                             }else if(h->ref_count[list] == 2){
4635                                 val= get_bits1(&s->gb)^1;
4636                             }else{
4637                                 val= get_ue_golomb_31(&s->gb);
4638                                 if(val >= h->ref_count[list]){
4639                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4640                                     return -1;
4641                                 }
4642                             }
4643                         }else
4644                             val= LIST_NOT_USED&0xFF;
4645                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4646                     }
4647             }
4648             for(list=0; list<h->list_count; list++){
4649                 for(i=0; i<2; i++){
4650                     unsigned int val;
4651                     if(IS_DIR(mb_type, i, list)){
4652                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4653                         mx += get_se_golomb(&s->gb);
4654                         my += get_se_golomb(&s->gb);
4655                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4656
4657                         val= pack16to32(mx,my);
4658                     }else
4659                         val=0;
4660                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4661                 }
4662             }
4663         }else{
4664             assert(IS_8X16(mb_type));
4665             for(list=0; list<h->list_count; list++){
4666                     for(i=0; i<2; i++){
4667                         unsigned int val;
4668                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4669                             if(h->ref_count[list]==1){
4670                                 val= 0;
4671                             }else if(h->ref_count[list]==2){
4672                                 val= get_bits1(&s->gb)^1;
4673                             }else{
4674                                 val= get_ue_golomb_31(&s->gb);
4675                                 if(val >= h->ref_count[list]){
4676                                     av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4677                                     return -1;
4678                                 }
4679                             }
4680                         }else
4681                             val= LIST_NOT_USED&0xFF;
4682                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4683                     }
4684             }
4685             for(list=0; list<h->list_count; list++){
4686                 for(i=0; i<2; i++){
4687                     unsigned int val;
4688                     if(IS_DIR(mb_type, i, list)){
4689                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4690                         mx += get_se_golomb(&s->gb);
4691                         my += get_se_golomb(&s->gb);
4692                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4693
4694                         val= pack16to32(mx,my);
4695                     }else
4696                         val=0;
4697                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4698                 }
4699             }
4700         }
4701     }
4702
4703     if(IS_INTER(mb_type))
4704         write_back_motion(h, mb_type);
4705
4706     if(!IS_INTRA16x16(mb_type)){
4707         cbp= get_ue_golomb(&s->gb);
4708         if(cbp > 47){
4709             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4710             return -1;
4711         }
4712
4713         if(CHROMA){
4714             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp[cbp];
4715             else                     cbp= golomb_to_inter_cbp   [cbp];
4716         }else{
4717             if(IS_INTRA4x4(mb_type)) cbp= golomb_to_intra4x4_cbp_gray[cbp];
4718             else                     cbp= golomb_to_inter_cbp_gray[cbp];
4719         }
4720     }
4721     h->cbp = cbp;
4722
4723     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4724         if(get_bits1(&s->gb)){
4725             mb_type |= MB_TYPE_8x8DCT;
4726             h->cbp_table[mb_xy]= cbp;
4727         }
4728     }
4729     s->current_picture.mb_type[mb_xy]= mb_type;
4730
4731     if(cbp || IS_INTRA16x16(mb_type)){
4732         int i8x8, i4x4, chroma_idx;
4733         int dquant;
4734         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4735         const uint8_t *scan, *scan8x8, *dc_scan;
4736
4737 //        fill_non_zero_count_cache(h);
4738
4739         if(IS_INTERLACED(mb_type)){
4740             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4741             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4742             dc_scan= luma_dc_field_scan;
4743         }else{
4744             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4745             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4746             dc_scan= luma_dc_zigzag_scan;
4747         }
4748
4749         dquant= get_se_golomb(&s->gb);
4750
4751         if( dquant > 25 || dquant < -26 ){
4752             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4753             return -1;
4754         }
4755
4756         s->qscale += dquant;
4757         if(((unsigned)s->qscale) > 51){
4758             if(s->qscale<0) s->qscale+= 52;
4759             else            s->qscale-= 52;
4760         }
4761
4762         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4763         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4764         if(IS_INTRA16x16(mb_type)){
4765             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4766                 return -1; //FIXME continue if partitioned and other return -1 too
4767             }
4768
4769             assert((cbp&15) == 0 || (cbp&15) == 15);
4770
4771             if(cbp&15){
4772                 for(i8x8=0; i8x8<4; i8x8++){
4773                     for(i4x4=0; i4x4<4; i4x4++){
4774                         const int index= i4x4 + 4*i8x8;
4775                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4776                             return -1;
4777                         }
4778                     }
4779                 }
4780             }else{
4781                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4782             }
4783         }else{
4784             for(i8x8=0; i8x8<4; i8x8++){
4785                 if(cbp & (1<<i8x8)){
4786                     if(IS_8x8DCT(mb_type)){
4787                         DCTELEM *buf = &h->mb[64*i8x8];
4788                         uint8_t *nnz;
4789                         for(i4x4=0; i4x4<4; i4x4++){
4790                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4791                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4792                                 return -1;
4793                         }
4794                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4795                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4796                     }else{
4797                         for(i4x4=0; i4x4<4; i4x4++){
4798                             const int index= i4x4 + 4*i8x8;
4799
4800                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4801                                 return -1;
4802                             }
4803                         }
4804                     }
4805                 }else{
4806                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4807                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4808                 }
4809             }
4810         }
4811
4812         if(cbp&0x30){
4813             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4814                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4815                     return -1;
4816                 }
4817         }
4818
4819         if(cbp&0x20){
4820             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4821                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4822                 for(i4x4=0; i4x4<4; i4x4++){
4823                     const int index= 16 + 4*chroma_idx + i4x4;
4824                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4825                         return -1;
4826                     }
4827                 }
4828             }
4829         }else{
4830             uint8_t * const nnz= &h->non_zero_count_cache[0];
4831             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4832             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4833         }
4834     }else{
4835         uint8_t * const nnz= &h->non_zero_count_cache[0];
4836         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4837         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4838         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4839     }
4840     s->current_picture.qscale_table[mb_xy]= s->qscale;
4841     write_back_non_zero_count(h);
4842
4843     if(MB_MBAFF){
4844         h->ref_count[0] >>= 1;
4845         h->ref_count[1] >>= 1;
4846     }
4847
4848     return 0;
4849 }
4850
4851 static int decode_cabac_field_decoding_flag(H264Context *h) {
4852     MpegEncContext * const s = &h->s;
4853     const int mb_x = s->mb_x;
4854     const int mb_y = s->mb_y & ~1;
4855     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4856     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4857
4858     unsigned int ctx = 0;
4859
4860     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4861         ctx += 1;
4862     }
4863     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4864         ctx += 1;
4865     }
4866
4867     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
4868 }
4869
4870 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4871     uint8_t *state= &h->cabac_state[ctx_base];
4872     int mb_type;
4873
4874     if(intra_slice){
4875         MpegEncContext * const s = &h->s;
4876         const int mba_xy = h->left_mb_xy[0];
4877         const int mbb_xy = h->top_mb_xy;
4878         int ctx=0;
4879         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4880             ctx++;
4881         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4882             ctx++;
4883         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
4884             return 0;   /* I4x4 */
4885         state += 2;
4886     }else{
4887         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
4888             return 0;   /* I4x4 */
4889     }
4890
4891     if( get_cabac_terminate( &h->cabac ) )
4892         return 25;  /* PCM */
4893
4894     mb_type = 1; /* I16x16 */
4895     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
4896     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
4897         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
4898     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
4899     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
4900     return mb_type;
4901 }
4902
4903 static int decode_cabac_mb_type_b( H264Context *h ) {
4904     MpegEncContext * const s = &h->s;
4905
4906         const int mba_xy = h->left_mb_xy[0];
4907         const int mbb_xy = h->top_mb_xy;
4908         int ctx = 0;
4909         int bits;
4910         assert(h->slice_type_nos == FF_B_TYPE);
4911
4912         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4913             ctx++;
4914         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4915             ctx++;
4916
4917         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
4918             return 0; /* B_Direct_16x16 */
4919
4920         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
4921             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4922         }
4923
4924         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
4925         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
4926         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
4927         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4928         if( bits < 8 )
4929             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4930         else if( bits == 13 ) {
4931             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4932         } else if( bits == 14 )
4933             return 11; /* B_L1_L0_8x16 */
4934         else if( bits == 15 )
4935             return 22; /* B_8x8 */
4936
4937         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
4938         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4939 }
4940
4941 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
4942     MpegEncContext * const s = &h->s;
4943     int mba_xy, mbb_xy;
4944     int ctx = 0;
4945
4946     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
4947         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
4948         mba_xy = mb_xy - 1;
4949         if( (mb_y&1)
4950             && h->slice_table[mba_xy] == h->slice_num
4951             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
4952             mba_xy += s->mb_stride;
4953         if( MB_FIELD ){
4954             mbb_xy = mb_xy - s->mb_stride;
4955             if( !(mb_y&1)
4956                 && h->slice_table[mbb_xy] == h->slice_num
4957                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
4958                 mbb_xy -= s->mb_stride;
4959         }else
4960             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
4961     }else{
4962         int mb_xy = h->mb_xy;
4963         mba_xy = mb_xy - 1;
4964         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
4965     }
4966
4967     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4968         ctx++;
4969     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4970         ctx++;
4971
4972     if( h->slice_type_nos == FF_B_TYPE )
4973         ctx += 13;
4974     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
4975 }
4976
4977 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4978     int mode = 0;
4979
4980     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4981         return pred_mode;
4982
4983     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
4984     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
4985     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
4986
4987     if( mode >= pred_mode )
4988         return mode + 1;
4989     else
4990         return mode;
4991 }
4992
4993 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4994     const int mba_xy = h->left_mb_xy[0];
4995     const int mbb_xy = h->top_mb_xy;
4996
4997     int ctx = 0;
4998
4999     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5000     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5001         ctx++;
5002
5003     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5004         ctx++;
5005
5006     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5007         return 0;
5008
5009     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5010         return 1;
5011     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5012         return 2;
5013     else
5014         return 3;
5015 }
5016
5017 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5018     int cbp_b, cbp_a, ctx, cbp = 0;
5019
5020     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5021     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5022
5023     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5024     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5025     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5026     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5027     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5028     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5029     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5030     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5031     return cbp;
5032 }
5033 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5034     int ctx;
5035     int cbp_a, cbp_b;
5036
5037     cbp_a = (h->left_cbp>>4)&0x03;
5038     cbp_b = (h-> top_cbp>>4)&0x03;
5039
5040     ctx = 0;
5041     if( cbp_a > 0 ) ctx++;
5042     if( cbp_b > 0 ) ctx += 2;
5043     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5044         return 0;
5045
5046     ctx = 4;
5047     if( cbp_a == 2 ) ctx++;
5048     if( cbp_b == 2 ) ctx += 2;
5049     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5050 }
5051 static int decode_cabac_mb_dqp( H264Context *h) {
5052     int   ctx= h->last_qscale_diff != 0;
5053     int   val = 0;
5054
5055     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5056         ctx= 2+(ctx>>1);
5057         val++;
5058         if(val > 102) //prevent infinite loop
5059             return INT_MIN;
5060     }
5061
5062     if( val&0x01 )
5063         return   (val + 1)>>1 ;
5064     else
5065         return -((val + 1)>>1);
5066 }
5067 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5068     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5069         return 0;   /* 8x8 */
5070     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5071         return 1;   /* 8x4 */
5072     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5073         return 2;   /* 4x8 */
5074     return 3;       /* 4x4 */
5075 }
5076 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5077     int type;
5078     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5079         return 0;   /* B_Direct_8x8 */
5080     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5081         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5082     type = 3;
5083     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5084         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5085             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5086         type += 4;
5087     }
5088     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5089     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5090     return type;
5091 }
5092
5093 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5094     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5095 }
5096
5097 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5098     int refa = h->ref_cache[list][scan8[n] - 1];
5099     int refb = h->ref_cache[list][scan8[n] - 8];
5100     int ref  = 0;
5101     int ctx  = 0;
5102
5103     if( h->slice_type_nos == FF_B_TYPE) {
5104         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5105             ctx++;
5106         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5107             ctx += 2;
5108     } else {
5109         if( refa > 0 )
5110             ctx++;
5111         if( refb > 0 )
5112             ctx += 2;
5113     }
5114
5115     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5116         ref++;
5117         ctx = (ctx>>2)+4;
5118         if(ref >= 32 /*h->ref_list[list]*/){
5119             return -1;
5120         }
5121     }
5122     return ref;
5123 }
5124
5125 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5126     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5127                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5128     int ctxbase = (l == 0) ? 40 : 47;
5129     int mvd;
5130     int ctx = (amvd>2) + (amvd>32);
5131
5132     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5133         return 0;
5134
5135     mvd= 1;
5136     ctx= 3;
5137     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5138         mvd++;
5139         if( ctx < 6 )
5140             ctx++;
5141     }
5142
5143     if( mvd >= 9 ) {
5144         int k = 3;
5145         while( get_cabac_bypass( &h->cabac ) ) {
5146             mvd += 1 << k;
5147             k++;
5148             if(k>24){
5149                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5150                 return INT_MIN;
5151             }
5152         }
5153         while( k-- ) {
5154             if( get_cabac_bypass( &h->cabac ) )
5155                 mvd += 1 << k;
5156         }
5157     }
5158     return get_cabac_bypass_sign( &h->cabac, -mvd );
5159 }
5160
5161 static av_always_inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx, int is_dc ) {
5162     int nza, nzb;
5163     int ctx = 0;
5164
5165     if( is_dc ) {
5166         if( cat == 0 ) {
5167             nza = h->left_cbp&0x100;
5168             nzb = h-> top_cbp&0x100;
5169         } else {
5170             nza = (h->left_cbp>>(6+idx))&0x01;
5171             nzb = (h-> top_cbp>>(6+idx))&0x01;
5172         }
5173     } else {
5174         assert(cat == 1 || cat == 2 || cat == 4);
5175         nza = h->non_zero_count_cache[scan8[idx] - 1];
5176         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5177     }
5178
5179     if( nza > 0 )
5180         ctx++;
5181
5182     if( nzb > 0 )
5183         ctx += 2;
5184
5185     return ctx + 4 * cat;
5186 }
5187
5188 DECLARE_ASM_CONST(1, uint8_t, last_coeff_flag_offset_8x8[63]) = {
5189     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5190     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5191     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5192     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5193 };
5194
5195 static av_always_inline void decode_cabac_residual_internal( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff, int is_dc ) {
5196     static const int significant_coeff_flag_offset[2][6] = {
5197       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5198       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5199     };
5200     static const int last_coeff_flag_offset[2][6] = {
5201       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5202       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5203     };
5204     static const int coeff_abs_level_m1_offset[6] = {
5205         227+0, 227+10, 227+20, 227+30, 227+39, 426
5206     };
5207     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5208       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5209         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5210         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5211        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5212       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5213         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5214         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5215         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5216     };
5217     /* node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
5218      * 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
5219      * map node ctx => cabac ctx for level=1 */
5220     static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
5221     /* map node ctx => cabac ctx for level>1 */
5222     static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
5223     static const uint8_t coeff_abs_level_transition[2][8] = {
5224     /* update node ctx after decoding a level=1 */
5225         { 1, 2, 3, 3, 4, 5, 6, 7 },
5226     /* update node ctx after decoding a level>1 */
5227         { 4, 4, 4, 4, 5, 6, 7, 7 }
5228     };
5229
5230     int index[64];
5231
5232     int av_unused last;
5233     int coeff_count = 0;
5234     int node_ctx = 0;
5235
5236     uint8_t *significant_coeff_ctx_base;
5237     uint8_t *last_coeff_ctx_base;
5238     uint8_t *abs_level_m1_ctx_base;
5239
5240 #if !ARCH_X86
5241 #define CABAC_ON_STACK
5242 #endif
5243 #ifdef CABAC_ON_STACK
5244 #define CC &cc
5245     CABACContext cc;
5246     cc.range     = h->cabac.range;
5247     cc.low       = h->cabac.low;
5248     cc.bytestream= h->cabac.bytestream;
5249 #else
5250 #define CC &h->cabac
5251 #endif
5252
5253
5254     /* cat: 0-> DC 16x16  n = 0
5255      *      1-> AC 16x16  n = luma4x4idx
5256      *      2-> Luma4x4   n = luma4x4idx
5257      *      3-> DC Chroma n = iCbCr
5258      *      4-> AC Chroma n = 16 + 4 * iCbCr + chroma4x4idx
5259      *      5-> Luma8x8   n = 4 * luma8x8idx
5260      */
5261
5262     /* read coded block flag */
5263     if( is_dc || cat != 5 ) {
5264         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n, is_dc ) ] ) == 0 ) {
5265             if( !is_dc )
5266                 h->non_zero_count_cache[scan8[n]] = 0;
5267
5268 #ifdef CABAC_ON_STACK
5269             h->cabac.range     = cc.range     ;
5270             h->cabac.low       = cc.low       ;
5271             h->cabac.bytestream= cc.bytestream;
5272 #endif
5273             return;
5274         }
5275     }
5276
5277     significant_coeff_ctx_base = h->cabac_state
5278         + significant_coeff_flag_offset[MB_FIELD][cat];
5279     last_coeff_ctx_base = h->cabac_state
5280         + last_coeff_flag_offset[MB_FIELD][cat];
5281     abs_level_m1_ctx_base = h->cabac_state
5282         + coeff_abs_level_m1_offset[cat];
5283
5284     if( !is_dc && cat == 5 ) {
5285 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5286         for(last= 0; last < coefs; last++) { \
5287             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5288             if( get_cabac( CC, sig_ctx )) { \
5289                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5290                 index[coeff_count++] = last; \
5291                 if( get_cabac( CC, last_ctx ) ) { \
5292                     last= max_coeff; \
5293                     break; \
5294                 } \
5295             } \
5296         }\
5297         if( last == max_coeff -1 ) {\
5298             index[coeff_count++] = last;\
5299         }
5300         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5301 #if ARCH_X86 && HAVE_7REGS && HAVE_EBX_AVAILABLE && !defined(BROKEN_RELOCATIONS)
5302         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5303     } else {
5304         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5305 #else
5306         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5307     } else {
5308         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5309 #endif
5310     }
5311     assert(coeff_count > 0);
5312
5313     if( is_dc ) {
5314         if( cat == 0 )
5315             h->cbp_table[h->mb_xy] |= 0x100;
5316         else
5317             h->cbp_table[h->mb_xy] |= 0x40 << n;
5318     } else {
5319         if( cat == 5 )
5320             fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5321         else {
5322             assert( cat == 1 || cat == 2 || cat == 4 );
5323             h->non_zero_count_cache[scan8[n]] = coeff_count;
5324         }
5325     }
5326
5327     do {
5328         uint8_t *ctx = coeff_abs_level1_ctx[node_ctx] + abs_level_m1_ctx_base;
5329
5330         int j= scantable[index[--coeff_count]];
5331
5332         if( get_cabac( CC, ctx ) == 0 ) {
5333             node_ctx = coeff_abs_level_transition[0][node_ctx];
5334             if( is_dc ) {
5335                 block[j] = get_cabac_bypass_sign( CC, -1);
5336             }else{
5337                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5338             }
5339         } else {
5340             int coeff_abs = 2;
5341             ctx = coeff_abs_levelgt1_ctx[node_ctx] + abs_level_m1_ctx_base;
5342             node_ctx = coeff_abs_level_transition[1][node_ctx];
5343
5344             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5345                 coeff_abs++;
5346             }
5347
5348             if( coeff_abs >= 15 ) {
5349                 int j = 0;
5350                 while( get_cabac_bypass( CC ) ) {
5351                     j++;
5352                 }
5353
5354                 coeff_abs=1;
5355                 while( j-- ) {
5356                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5357                 }
5358                 coeff_abs+= 14;
5359             }
5360
5361             if( is_dc ) {
5362                 block[j] = get_cabac_bypass_sign( CC, -coeff_abs );
5363             }else{
5364                 block[j] = (get_cabac_bypass_sign( CC, -coeff_abs ) * qmul[j] + 32) >> 6;
5365             }
5366         }
5367     } while( coeff_count );
5368 #ifdef CABAC_ON_STACK
5369             h->cabac.range     = cc.range     ;
5370             h->cabac.low       = cc.low       ;
5371             h->cabac.bytestream= cc.bytestream;
5372 #endif
5373
5374 }
5375
5376 #if !CONFIG_SMALL
5377 static void decode_cabac_residual_dc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5378     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 1);
5379 }
5380
5381 static void decode_cabac_residual_nondc( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5382     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, 0);
5383 }
5384 #endif
5385
5386 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff ) {
5387 #if CONFIG_SMALL
5388     decode_cabac_residual_internal(h, block, cat, n, scantable, qmul, max_coeff, cat == 0 || cat == 3);
5389 #else
5390     if( cat == 0 || cat == 3 ) decode_cabac_residual_dc(h, block, cat, n, scantable, qmul, max_coeff);
5391     else decode_cabac_residual_nondc(h, block, cat, n, scantable, qmul, max_coeff);
5392 #endif
5393 }
5394
5395 static inline void compute_mb_neighbors(H264Context *h)
5396 {
5397     MpegEncContext * const s = &h->s;
5398     const int mb_xy  = h->mb_xy;
5399     h->top_mb_xy     = mb_xy - s->mb_stride;
5400     h->left_mb_xy[0] = mb_xy - 1;
5401     if(FRAME_MBAFF){
5402         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5403         const int top_pair_xy      = pair_xy     - s->mb_stride;
5404         const int top_mb_field_flag  = IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5405         const int left_mb_field_flag = IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5406         const int curr_mb_field_flag = MB_FIELD;
5407         const int bottom = (s->mb_y & 1);
5408
5409         if (curr_mb_field_flag && (bottom || top_mb_field_flag)){
5410             h->top_mb_xy -= s->mb_stride;
5411         }
5412         if (!left_mb_field_flag == curr_mb_field_flag) {
5413             h->left_mb_xy[0] = pair_xy - 1;
5414         }
5415     } else if (FIELD_PICTURE) {
5416         h->top_mb_xy -= s->mb_stride;
5417     }
5418     return;
5419 }
5420
5421 /**
5422  * decodes a macroblock
5423  * @returns 0 if OK, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5424  */
5425 static int decode_mb_cabac(H264Context *h) {
5426     MpegEncContext * const s = &h->s;
5427     int mb_xy;
5428     int mb_type, partition_count, cbp = 0;
5429     int dct8x8_allowed= h->pps.transform_8x8_mode;
5430
5431     mb_xy = h->mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5432
5433     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5434     if( h->slice_type_nos != FF_I_TYPE ) {
5435         int skip;
5436         /* a skipped mb needs the aff flag from the following mb */
5437         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5438             predict_field_decoding_flag(h);
5439         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5440             skip = h->next_mb_skipped;
5441         else
5442             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5443         /* read skip flags */
5444         if( skip ) {
5445             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5446                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5447                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5448                 if(!h->next_mb_skipped)
5449                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5450             }
5451
5452             decode_mb_skip(h);
5453
5454             h->cbp_table[mb_xy] = 0;
5455             h->chroma_pred_mode_table[mb_xy] = 0;
5456             h->last_qscale_diff = 0;
5457
5458             return 0;
5459
5460         }
5461     }
5462     if(FRAME_MBAFF){
5463         if( (s->mb_y&1) == 0 )
5464             h->mb_mbaff =
5465             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5466     }
5467
5468     h->prev_mb_skipped = 0;
5469
5470     compute_mb_neighbors(h);
5471
5472     if( h->slice_type_nos == FF_B_TYPE ) {
5473         mb_type = decode_cabac_mb_type_b( h );
5474         if( mb_type < 23 ){
5475             partition_count= b_mb_type_info[mb_type].partition_count;
5476             mb_type=         b_mb_type_info[mb_type].type;
5477         }else{
5478             mb_type -= 23;
5479             goto decode_intra_mb;
5480         }
5481     } else if( h->slice_type_nos == FF_P_TYPE ) {
5482         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5483             /* P-type */
5484             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5485                 /* P_L0_D16x16, P_8x8 */
5486                 mb_type= 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5487             } else {
5488                 /* P_L0_D8x16, P_L0_D16x8 */
5489                 mb_type= 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5490             }
5491             partition_count= p_mb_type_info[mb_type].partition_count;
5492             mb_type=         p_mb_type_info[mb_type].type;
5493         } else {
5494             mb_type= decode_cabac_intra_mb_type(h, 17, 0);
5495             goto decode_intra_mb;
5496         }
5497     } else {
5498         mb_type= decode_cabac_intra_mb_type(h, 3, 1);
5499         if(h->slice_type == FF_SI_TYPE && mb_type)
5500             mb_type--;
5501         assert(h->slice_type_nos == FF_I_TYPE);
5502 decode_intra_mb:
5503         partition_count = 0;
5504         cbp= i_mb_type_info[mb_type].cbp;
5505         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5506         mb_type= i_mb_type_info[mb_type].type;
5507     }
5508     if(MB_FIELD)
5509         mb_type |= MB_TYPE_INTERLACED;
5510
5511     h->slice_table[ mb_xy ]= h->slice_num;
5512
5513     if(IS_INTRA_PCM(mb_type)) {
5514         const uint8_t *ptr;
5515
5516         // We assume these blocks are very rare so we do not optimize it.
5517         // FIXME The two following lines get the bitstream position in the cabac
5518         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5519         ptr= h->cabac.bytestream;
5520         if(h->cabac.low&0x1) ptr--;
5521         if(CABAC_BITS==16){
5522             if(h->cabac.low&0x1FF) ptr--;
5523         }
5524
5525         // The pixels are stored in the same order as levels in h->mb array.
5526         memcpy(h->mb, ptr, 256); ptr+=256;
5527         if(CHROMA){
5528             memcpy(h->mb+128, ptr, 128); ptr+=128;
5529         }
5530
5531         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5532
5533         // All blocks are present
5534         h->cbp_table[mb_xy] = 0x1ef;
5535         h->chroma_pred_mode_table[mb_xy] = 0;
5536         // In deblocking, the quantizer is 0
5537         s->current_picture.qscale_table[mb_xy]= 0;
5538         // All coeffs are present
5539         memset(h->non_zero_count[mb_xy], 16, 16);
5540         s->current_picture.mb_type[mb_xy]= mb_type;
5541         h->last_qscale_diff = 0;
5542         return 0;
5543     }
5544
5545     if(MB_MBAFF){
5546         h->ref_count[0] <<= 1;
5547         h->ref_count[1] <<= 1;
5548     }
5549
5550     fill_caches(h, mb_type, 0);
5551
5552     if( IS_INTRA( mb_type ) ) {
5553         int i, pred_mode;
5554         if( IS_INTRA4x4( mb_type ) ) {
5555             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5556                 mb_type |= MB_TYPE_8x8DCT;
5557                 for( i = 0; i < 16; i+=4 ) {
5558                     int pred = pred_intra_mode( h, i );
5559                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5560                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5561                 }
5562             } else {
5563                 for( i = 0; i < 16; i++ ) {
5564                     int pred = pred_intra_mode( h, i );
5565                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5566
5567                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5568                 }
5569             }
5570             write_back_intra_pred_mode(h);
5571             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5572         } else {
5573             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5574             if( h->intra16x16_pred_mode < 0 ) return -1;
5575         }
5576         if(CHROMA){
5577             h->chroma_pred_mode_table[mb_xy] =
5578             pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5579
5580             pred_mode= check_intra_pred_mode( h, pred_mode );
5581             if( pred_mode < 0 ) return -1;
5582             h->chroma_pred_mode= pred_mode;
5583         }
5584     } else if( partition_count == 4 ) {
5585         int i, j, sub_partition_count[4], list, ref[2][4];
5586
5587         if( h->slice_type_nos == FF_B_TYPE ) {
5588             for( i = 0; i < 4; i++ ) {
5589                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5590                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5591                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5592             }
5593             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5594                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5595                 pred_direct_motion(h, &mb_type);
5596                 h->ref_cache[0][scan8[4]] =
5597                 h->ref_cache[1][scan8[4]] =
5598                 h->ref_cache[0][scan8[12]] =
5599                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5600                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5601                     for( i = 0; i < 4; i++ )
5602                         if( IS_DIRECT(h->sub_mb_type[i]) )
5603                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5604                 }
5605             }
5606         } else {
5607             for( i = 0; i < 4; i++ ) {
5608                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5609                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5610                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5611             }
5612         }
5613
5614         for( list = 0; list < h->list_count; list++ ) {
5615                 for( i = 0; i < 4; i++ ) {
5616                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5617                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5618                         if( h->ref_count[list] > 1 ){
5619                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5620                             if(ref[list][i] >= (unsigned)h->ref_count[list]){
5621                                 av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref[list][i], h->ref_count[list]);
5622                                 return -1;
5623                             }
5624                         }else
5625                             ref[list][i] = 0;
5626                     } else {
5627                         ref[list][i] = -1;
5628                     }
5629                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5630                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5631                 }
5632         }
5633
5634         if(dct8x8_allowed)
5635             dct8x8_allowed = get_dct8x8_allowed(h);
5636
5637         for(list=0; list<h->list_count; list++){
5638             for(i=0; i<4; i++){
5639                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5640                 if(IS_DIRECT(h->sub_mb_type[i])){
5641                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5642                     continue;
5643                 }
5644
5645                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5646                     const int sub_mb_type= h->sub_mb_type[i];
5647                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5648                     for(j=0; j<sub_partition_count[i]; j++){
5649                         int mpx, mpy;
5650                         int mx, my;
5651                         const int index= 4*i + block_width*j;
5652                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5653                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5654                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5655
5656                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5657                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5658                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5659
5660                         if(IS_SUB_8X8(sub_mb_type)){
5661                             mv_cache[ 1 ][0]=
5662                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5663                             mv_cache[ 1 ][1]=
5664                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5665
5666                             mvd_cache[ 1 ][0]=
5667                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5668                             mvd_cache[ 1 ][1]=
5669                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5670                         }else if(IS_SUB_8X4(sub_mb_type)){
5671                             mv_cache[ 1 ][0]= mx;
5672                             mv_cache[ 1 ][1]= my;
5673
5674                             mvd_cache[ 1 ][0]= mx - mpx;
5675                             mvd_cache[ 1 ][1]= my - mpy;
5676                         }else if(IS_SUB_4X8(sub_mb_type)){
5677                             mv_cache[ 8 ][0]= mx;
5678                             mv_cache[ 8 ][1]= my;
5679
5680                             mvd_cache[ 8 ][0]= mx - mpx;
5681                             mvd_cache[ 8 ][1]= my - mpy;
5682                         }
5683                         mv_cache[ 0 ][0]= mx;
5684                         mv_cache[ 0 ][1]= my;
5685
5686                         mvd_cache[ 0 ][0]= mx - mpx;
5687                         mvd_cache[ 0 ][1]= my - mpy;
5688                     }
5689                 }else{
5690                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5691                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5692                     p[0] = p[1] = p[8] = p[9] = 0;
5693                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5694                 }
5695             }
5696         }
5697     } else if( IS_DIRECT(mb_type) ) {
5698         pred_direct_motion(h, &mb_type);
5699         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5700         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5701         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5702     } else {
5703         int list, mx, my, i, mpx, mpy;
5704         if(IS_16X16(mb_type)){
5705             for(list=0; list<h->list_count; list++){
5706                 if(IS_DIR(mb_type, 0, list)){
5707                     int ref;
5708                     if(h->ref_count[list] > 1){
5709                         ref= decode_cabac_mb_ref(h, list, 0);
5710                         if(ref >= (unsigned)h->ref_count[list]){
5711                             av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5712                             return -1;
5713                         }
5714                     }else
5715                         ref=0;
5716                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5717                 }else
5718                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5719             }
5720             for(list=0; list<h->list_count; list++){
5721                 if(IS_DIR(mb_type, 0, list)){
5722                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5723
5724                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5725                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5726                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5727
5728                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5729                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5730                 }else
5731                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5732             }
5733         }
5734         else if(IS_16X8(mb_type)){
5735             for(list=0; list<h->list_count; list++){
5736                     for(i=0; i<2; i++){
5737                         if(IS_DIR(mb_type, i, list)){
5738                             int ref;
5739                             if(h->ref_count[list] > 1){
5740                                 ref= decode_cabac_mb_ref( h, list, 8*i );
5741                                 if(ref >= (unsigned)h->ref_count[list]){
5742                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5743                                     return -1;
5744                                 }
5745                             }else
5746                                 ref=0;
5747                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5748                         }else
5749                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5750                     }
5751             }
5752             for(list=0; list<h->list_count; list++){
5753                 for(i=0; i<2; i++){
5754                     if(IS_DIR(mb_type, i, list)){
5755                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5756                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5757                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5758                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5759
5760                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5761                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5762                     }else{
5763                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5764                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5765                     }
5766                 }
5767             }
5768         }else{
5769             assert(IS_8X16(mb_type));
5770             for(list=0; list<h->list_count; list++){
5771                     for(i=0; i<2; i++){
5772                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5773                             int ref;
5774                             if(h->ref_count[list] > 1){
5775                                 ref= decode_cabac_mb_ref( h, list, 4*i );
5776                                 if(ref >= (unsigned)h->ref_count[list]){
5777                                     av_log(s->avctx, AV_LOG_ERROR, "Reference %d >= %d\n", ref, h->ref_count[list]);
5778                                     return -1;
5779                                 }
5780                             }else
5781                                 ref=0;
5782                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5783                         }else
5784                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5785                     }
5786             }
5787             for(list=0; list<h->list_count; list++){
5788                 for(i=0; i<2; i++){
5789                     if(IS_DIR(mb_type, i, list)){
5790                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5791                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5792                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5793
5794                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5795                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5796                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5797                     }else{
5798                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5799                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5800                     }
5801                 }
5802             }
5803         }
5804     }
5805
5806    if( IS_INTER( mb_type ) ) {
5807         h->chroma_pred_mode_table[mb_xy] = 0;
5808         write_back_motion( h, mb_type );
5809    }
5810
5811     if( !IS_INTRA16x16( mb_type ) ) {
5812         cbp  = decode_cabac_mb_cbp_luma( h );
5813         if(CHROMA)
5814             cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5815     }
5816
5817     h->cbp_table[mb_xy] = h->cbp = cbp;
5818
5819     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5820         if( decode_cabac_mb_transform_size( h ) )
5821             mb_type |= MB_TYPE_8x8DCT;
5822     }
5823     s->current_picture.mb_type[mb_xy]= mb_type;
5824
5825     if( cbp || IS_INTRA16x16( mb_type ) ) {
5826         const uint8_t *scan, *scan8x8, *dc_scan;
5827         const uint32_t *qmul;
5828         int dqp;
5829
5830         if(IS_INTERLACED(mb_type)){
5831             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5832             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5833             dc_scan= luma_dc_field_scan;
5834         }else{
5835             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5836             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5837             dc_scan= luma_dc_zigzag_scan;
5838         }
5839
5840         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5841         if( dqp == INT_MIN ){
5842             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5843             return -1;
5844         }
5845         s->qscale += dqp;
5846         if(((unsigned)s->qscale) > 51){
5847             if(s->qscale<0) s->qscale+= 52;
5848             else            s->qscale-= 52;
5849         }
5850         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5851         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5852
5853         if( IS_INTRA16x16( mb_type ) ) {
5854             int i;
5855             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5856             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5857
5858             if( cbp&15 ) {
5859                 qmul = h->dequant4_coeff[0][s->qscale];
5860                 for( i = 0; i < 16; i++ ) {
5861                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5862                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5863                 }
5864             } else {
5865                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5866             }
5867         } else {
5868             int i8x8, i4x4;
5869             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5870                 if( cbp & (1<<i8x8) ) {
5871                     if( IS_8x8DCT(mb_type) ) {
5872                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
5873                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
5874                     } else {
5875                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
5876                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5877                             const int index = 4*i8x8 + i4x4;
5878                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5879 //START_TIMER
5880                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
5881 //STOP_TIMER("decode_residual")
5882                         }
5883                     }
5884                 } else {
5885                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5886                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5887                 }
5888             }
5889         }
5890
5891         if( cbp&0x30 ){
5892             int c;
5893             for( c = 0; c < 2; c++ ) {
5894                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5895                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
5896             }
5897         }
5898
5899         if( cbp&0x20 ) {
5900             int c, i;
5901             for( c = 0; c < 2; c++ ) {
5902                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
5903                 for( i = 0; i < 4; i++ ) {
5904                     const int index = 16 + 4 * c + i;
5905                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5906                     decode_cabac_residual(h, h->mb + 16*index, 4, index, scan + 1, qmul, 15);
5907                 }
5908             }
5909         } else {
5910             uint8_t * const nnz= &h->non_zero_count_cache[0];
5911             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5912             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5913         }
5914     } else {
5915         uint8_t * const nnz= &h->non_zero_count_cache[0];
5916         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5917         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5918         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5919         h->last_qscale_diff = 0;
5920     }
5921
5922     s->current_picture.qscale_table[mb_xy]= s->qscale;
5923     write_back_non_zero_count(h);
5924
5925     if(MB_MBAFF){
5926         h->ref_count[0] >>= 1;
5927         h->ref_count[1] >>= 1;
5928     }
5929
5930     return 0;
5931 }
5932
5933
/**
 * Deblock one luma edge through the DSP h264_h_loop_filter_luma* routines,
 * using a single quantizer for the whole edge.
 *
 * alpha and beta are looked up in alpha_table/beta_table at qp plus the
 * slice's alpha_c0 / beta offsets. Only bS[0] selects the filter: when it is
 * below 4 one tc value per bS entry is read from tc0_table and the normal
 * filter runs; otherwise the intra variant runs and bS[1..3] are not read.
 *
 * @param h      decoder context (slice offsets and DSP function pointers)
 * @param pix    pixel pointer passed unchanged to the DSP routine
 * @param stride line stride passed unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     quantizer used for the table lookups
 */
5934 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5935     const int idx_a = h->slice_alpha_c0_offset + qp;
5936     const int idx_b = h->slice_beta_offset + qp;
5937     const int alpha = (alpha_table+52)[idx_a];
5938     const int beta  = (beta_table+52)[idx_b];
5939     int8_t tc[4];
5940     int k;
5941
5942     if( bS[0] >= 4 ) { /* intra variant takes no tc array */
5943         h->s.dsp.h264_h_loop_filter_luma_intra(pix, stride, alpha, beta);
5944         return;
5945     }
5946     for( k = 0; k < 4; k++ )
5947         tc[k] = (tc0_table+52)[idx_a][bS[k]];
5948     h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5949 }
/**
 * Deblock one chroma edge through the DSP h264_h_loop_filter_chroma* routines,
 * using a single quantizer for the whole edge.
 *
 * alpha and beta are looked up in alpha_table/beta_table at qp plus the
 * slice's alpha_c0 / beta offsets. Only bS[0] selects the filter: when it is
 * below 4 each tc is the tc0_table entry for the matching bS plus one and the
 * normal filter runs; otherwise the intra variant runs and bS[1..3] are not
 * read.
 *
 * @param h      decoder context (slice offsets and DSP function pointers)
 * @param pix    pixel pointer passed unchanged to the DSP routine
 * @param stride line stride passed unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     quantizer used for the table lookups
 */
5950 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
5951     const int idx_a = h->slice_alpha_c0_offset + qp;
5952     const int idx_b = h->slice_beta_offset + qp;
5953     const int alpha = (alpha_table+52)[idx_a];
5954     const int beta  = (beta_table+52)[idx_b];
5955     int8_t tc[4];
5956     int k;
5957
5958     if( bS[0] >= 4 ) { /* intra variant takes no tc array */
5959         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
5960         return;
5961     }
5962     for( k = 0; k < 4; k++ )
5963         tc[k] = (tc0_table+52)[idx_a][bS[k]] + 1;
5964     h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5965 }
5966
/**
 * Filter a luma edge row by row, choosing boundary strength and quantizer
 * per row depending on MB_FIELD.
 *
 * Sixteen rows are visited; pix advances by stride after each one. Rows whose
 * selected bS entry is 0 are left untouched. For the others, alpha/beta are
 * looked up from the row's quantizer plus the slice offsets, and the pixels
 * pix[-3..2] (pix[-4..3] in the strongest case) are filtered in place:
 * bS < 4 uses the tc0_table-clipped filter, anything else the strong filter.
 *
 * @param h      decoder context (slice offsets; also used by MB_FIELD/tprintf)
 * @param pix    pointer to the first pixel on the q side of the edge, row 0
 * @param stride distance in bytes between consecutive rows
 * @param bS     eight boundary strengths, indexed per row as described below
 * @param qp     two quantizers, indexed per row as described below
 */
5967 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
5968     int i;
5969     for( i = 0; i < 16; i++, pix += stride) {
5970         int index_a;
5971         int alpha;
5972         int beta;
5973
5974         int qp_index;
             /* MB_FIELD: every pair of consecutive rows shares one bS entry. */
5975         int bS_index = (i >> 1);
5976         if (!MB_FIELD) {
                 /* Otherwise, within each group of four rows the even rows use
                  * one entry and the odd rows the next (0,1,0,1,2,3,2,3,...). */
5977             bS_index &= ~1;
5978             bS_index |= (i & 1);
5979         }
5980
5981         if( bS[bS_index] == 0 ) {
5982             continue;
5983         }
5984
             /* MB_FIELD: rows 0-7 use qp[0], rows 8-15 use qp[1];
              * otherwise even rows use qp[0] and odd rows qp[1]. */
5985         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
5986         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
5987         alpha = (alpha_table+52)[index_a];
5988         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
5989
5990         if( bS[bS_index] < 4 ) {
5991             const int tc0 = (tc0_table+52)[index_a][bS[bS_index]];
5992             const int p0 = pix[-1];
5993             const int p1 = pix[-2];
5994             const int p2 = pix[-3];
5995             const int q0 = pix[0];
5996             const int q1 = pix[1];
5997             const int q2 = pix[2];
5998
5999             if( FFABS( p0 - q0 ) < alpha &&
6000                 FFABS( p1 - p0 ) < beta &&
6001                 FFABS( q1 - q0 ) < beta ) {
                     /* tc starts at tc0 and grows by one for each side whose
                      * second pixel (p1 / q1) is also adjusted. */
6002                 int tc = tc0;
6003                 int i_delta;
6004
6005                 if( FFABS( p2 - p0 ) < beta ) {
6006                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6007                     tc++;
6008                 }
6009                 if( FFABS( q2 - q0 ) < beta ) {
6010                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6011                     tc++;
6012                 }
6013
6014                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6015                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6016                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* The "=>" part of this trace passes the locals p1/q1, i.e. the
                      * values read before filtering, not the updated pix[-2]/pix[1]. */
6017                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6018             }
6019         }else{
                 /* bS >= 4: strong filter, no tc clipping. */
6020             const int p0 = pix[-1];
6021             const int p1 = pix[-2];
6022             const int p2 = pix[-3];
6023
6024             const int q0 = pix[0];
6025             const int q1 = pix[1];
6026             const int q2 = pix[2];
6027
6028             if( FFABS( p0 - q0 ) < alpha &&
6029                 FFABS( p1 - p0 ) < beta &&
6030                 FFABS( q1 - q0 ) < beta ) {
6031
                     /* A small step across the edge allows the three-pixel
                      * update on each side whose p2/q2 test also passes. */
6032                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6033                     if( FFABS( p2 - p0 ) < beta)
6034                     {
6035                         const int p3 = pix[-4];
6036                         /* p0', p1', p2' */
6037                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6038                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6039                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6040                     } else {
6041                         /* p0' */
6042                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6043                     }
6044                     if( FFABS( q2 - q0 ) < beta)
6045                     {
6046                         const int q3 = pix[3];
6047                         /* q0', q1', q2' */
6048                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6049                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6050                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6051                     } else {
6052                         /* q0' */
6053                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6054                     }
6055                 }else{
6056                     /* p0', q0' */
6057                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6058                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6059                 }
                     /* Trace: locals hold the pre-filter values, pix[] the results. */
6060                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6061             }
6062         }
6063     }
6064 }
/**
 * Filter a chroma edge row by row, with one boundary strength per row and a
 * quantizer chosen per row depending on MB_FIELD.
 *
 * Eight rows are visited; pix advances by stride after each one. Rows with
 * bS[i] == 0 are skipped. Only pix[-1] and pix[0] are modified: bS < 4 applies
 * a delta clipped to +/-(tc0_table entry + 1), anything else the unclipped
 * strong filter.
 *
 * @param h      decoder context (slice offsets; also used by MB_FIELD/tprintf)
 * @param pix    pointer to the first pixel on the q side of the edge, row 0
 * @param stride distance in bytes between consecutive rows
 * @param bS     eight boundary strengths, one per row
 * @param qp     two quantizers, indexed per row as described below
 */
6065 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6066     int i;
6067     for( i = 0; i < 8; i++, pix += stride) {
6068         int index_a;
6069         int alpha;
6070         int beta;
6071
6072         int qp_index;
6073         int bS_index = i;
6074
6075         if( bS[bS_index] == 0 ) {
6076             continue;
6077         }
6078
             /* MB_FIELD: rows 0-3 use qp[0], rows 4-7 use qp[1];
              * otherwise even rows use qp[0] and odd rows qp[1]. */
6079         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6080         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6081         alpha = (alpha_table+52)[index_a];
6082         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6083
6084         if( bS[bS_index] < 4 ) {
6085             const int tc = (tc0_table+52)[index_a][bS[bS_index]] + 1;
6086             const int p0 = pix[-1];
6087             const int p1 = pix[-2];
6088             const int q0 = pix[0];
6089             const int q1 = pix[1];
6090
6091             if( FFABS( p0 - q0 ) < alpha &&
6092                 FFABS( p1 - p0 ) < beta &&
6093                 FFABS( q1 - q0 ) < beta ) {
6094                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6095
6096                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6097                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
                     /* Trace: p0/q0 locals are pre-filter, pix[-1]/pix[0] are the results. */
6098                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6099             }
6100         }else{
                 /* bS >= 4: strong filter, no tc clipping. */
6101             const int p0 = pix[-1];
6102             const int p1 = pix[-2];
6103             const int q0 = pix[0];
6104             const int q1 = pix[1];
6105
6106             if( FFABS( p0 - q0 ) < alpha &&
6107                 FFABS( p1 - p0 ) < beta &&
6108                 FFABS( q1 - q0 ) < beta ) {
6109
6110                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6111                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6112                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6113             }
6114         }
6115     }
6116 }
6117
/**
 * Deblock one luma edge through the DSP h264_v_loop_filter_luma* routines,
 * using a single quantizer for the whole edge.
 *
 * alpha and beta are looked up in alpha_table/beta_table at qp plus the
 * slice's alpha_c0 / beta offsets. Only bS[0] selects the filter: when it is
 * below 4 one tc value per bS entry is read from tc0_table and the normal
 * filter runs; otherwise the intra variant runs and bS[1..3] are not read.
 *
 * @param h      decoder context (slice offsets and DSP function pointers)
 * @param pix    pixel pointer passed unchanged to the DSP routine
 * @param stride line stride passed unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     quantizer used for the table lookups
 */
6118 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6119     const int idx_a = h->slice_alpha_c0_offset + qp;
6120     const int idx_b = h->slice_beta_offset + qp;
6121     const int alpha = (alpha_table+52)[idx_a];
6122     const int beta  = (beta_table+52)[idx_b];
6123     int8_t tc[4];
6124     int k;
6125
6126     if( bS[0] >= 4 ) { /* intra variant takes no tc array */
6127         h->s.dsp.h264_v_loop_filter_luma_intra(pix, stride, alpha, beta);
6128         return;
6129     }
6130     for( k = 0; k < 4; k++ )
6131         tc[k] = (tc0_table+52)[idx_a][bS[k]];
6132     h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6133 }
6134
/**
 * Deblock one chroma edge through the DSP h264_v_loop_filter_chroma* routines,
 * using a single quantizer for the whole edge.
 *
 * alpha and beta are looked up in alpha_table/beta_table at qp plus the
 * slice's alpha_c0 / beta offsets. Only bS[0] selects the filter: when it is
 * below 4 each tc is the tc0_table entry for the matching bS plus one and the
 * normal filter runs; otherwise the intra variant runs and bS[1..3] are not
 * read.
 *
 * @param h      decoder context (slice offsets and DSP function pointers)
 * @param pix    pixel pointer passed unchanged to the DSP routine
 * @param stride line stride passed unchanged to the DSP routine
 * @param bS     four boundary strengths for this edge
 * @param qp     quantizer used for the table lookups
 */
6135 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6136     const int idx_a = h->slice_alpha_c0_offset + qp;
6137     const int idx_b = h->slice_beta_offset + qp;
6138     const int alpha = (alpha_table+52)[idx_a];
6139     const int beta  = (beta_table+52)[idx_b];
6140     int8_t tc[4];
6141     int k;
6142
6143     if( bS[0] >= 4 ) { /* intra variant takes no tc array */
6144         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6145         return;
6146     }
6147     for( k = 0; k < 4; k++ )
6148         tc[k] = (tc0_table+52)[idx_a][bS[k]] + 1;
6149     h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6150 }
6151
/**
 * Deblock one macroblock using the whole-edge helpers, delegating to
 * filter_mb() whenever the shortcut's preconditions are not met.
 *
 * The shortcut is taken only when the macroblock is not in column 0, not in
 * the first row (row 1 for a bottom field picture), a DSP
 * h264_loop_filter_strength routine exists, pps.chroma_qp_diff is zero,
 * CODEC_FLAG2_FAST is set, and - for deblocking_filter == 2 - the left and
 * top neighbours carry the same slice_table value as this macroblock.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma pixels of this macroblock
 * @param img_cb     Cb pixels of this macroblock
 * @param img_cr     Cr pixels of this macroblock
 * @param linesize   luma line stride
 * @param uvlinesize chroma line stride
 */
6152 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6153     MpegEncContext * const s = &h->s;
         /* 1 for a bottom field picture, 0 otherwise. */
6154     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6155     int mb_xy, mb_type;
6156     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6157
6158     mb_xy = h->mb_xy;
6159
         /* Any condition below sends the macroblock to the general filter_mb(). */
6160     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6161         !(s->flags2 & CODEC_FLAG2_FAST) || //FIXME filter_mb_fast is broken, thus has to be, but should not under CODEC_FLAG2_FAST
6162        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6163                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6164         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6165         return;
6166     }
6167     assert(!FRAME_MBAFF);
6168
6169     mb_type = s->current_picture.mb_type[mb_xy];
         /* qp: this MB; qp0: left neighbour; qp1: top neighbour (luma), and the
          * matching chroma values via get_chroma_qp() with index 0 only. */
6170     qp = s->current_picture.qscale_table[mb_xy];
6171     qp0 = s->current_picture.qscale_table[mb_xy-1];
6172     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6173     qpc = get_chroma_qp( h, 0, qp );
6174     qpc0 = get_chroma_qp( h, 0, qp0 );
6175     qpc1 = get_chroma_qp( h, 0, qp1 );
         /* From here on qp0/qp1/qpc0/qpc1 are the rounded-up averages of this
          * macroblock's value and the neighbour's, used for the shared edges. */
6176     qp0 = (qp + qp0 + 1) >> 1;
6177     qp1 = (qp + qp1 + 1) >> 1;
6178     qpc0 = (qpc + qpc0 + 1) >> 1;
6179     qpc1 = (qpc + qpc1 + 1) >> 1;
         /* Nothing is filtered when every quantizer involved is at or below
          * 15 - slice_alpha_c0_offset.
          * NOTE(review): presumably alpha_table is zero up to that index so the
          * filters would be no-ops - confirm against alpha_table. */
6180     qp_thresh = 15 - h->slice_alpha_c0_offset;
6181     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6182        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6183         return;
6184
6185     if( IS_INTRA(mb_type) ) {
             /* Fixed strengths: 4 on the left edge, bSH on the top edge
              * (3 in a field picture, otherwise 4), 3 on internal edges. */
6186         int16_t bS4[4] = {4,4,4,4};
6187         int16_t bS3[4] = {3,3,3,3};
6188         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6189         if( IS_8x8DCT(mb_type) ) {
                 /* 8x8 transform: only luma edges 0 and 2 are filtered. */
6190             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6191             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6192             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6193             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6194         } else {
6195             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6196             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6197             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6198             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6199             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6200             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6201             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6202             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6203         }
             /* Chroma: edges 0 and 2 of each plane, in both directions. */
6204         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6205         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6206         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6207         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6208         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6209         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6210         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6211         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6212         return;
6213     } else {
             /* bS[dir][edge][i]: dir 0 is handled by the *v helpers, dir 1 by the
              * *h helpers. bSv views each edge's four int16_t strengths as one
              * uint64_t so a whole edge can be set or tested for zero at once. */
6214         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6215         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6216         int edges;
6217         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
                 /* Shortcut: strength 2 on edges 0 and 2 in both directions.
                  * Edges 1 and 3 stay uninitialised; the IS_8x8DCT branch of the
                  * FILTER dispatch below never reads them. */
6218             edges = 4;
6219             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6220         } else {
6221             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6222                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6223             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6224                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6225                              ? 3 : 0;
6226             int step = IS_8x8DCT(mb_type) ? 2 : 1;
                 /* A 16x16 macroblock with no luma coefficients only needs edge 0. */
6227             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6228             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6229                                               (h->slice_type_nos == FF_B_TYPE), edges, step, mask_edge0, mask_edge1, FIELD_PICTURE);
6230         }
             /* An intra neighbour overrides the shared edge: 4 on the left edge;
              * on the top edge 3 in a field picture, otherwise 4. */
6231         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6232             bSv[0][0] = 0x0004000400040004ULL;
6233         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6234             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6235
             /* FILTER(hv,dir,edge): if any of the edge's four strengths is non-zero,
              * filter the luma edge and, for even edges, both chroma planes. Edge 0
              * uses the neighbour-averaged quantizer (qp0/qp1, qpc0/qpc1, selected by
              * pasting dir onto the name); other edges use qp/qpc. */
6236 #define FILTER(hv,dir,edge)\
6237         if(bSv[dir][edge]) {\
6238             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6239             if(!(edge&1)) {\
6240                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6241                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6242             }\
6243         }
6244         if( edges == 1 ) {
6245             FILTER(v,0,0);
6246             FILTER(h,1,0);
6247         } else if( IS_8x8DCT(mb_type) ) {
6248             FILTER(v,0,0);
6249             FILTER(v,0,2);
6250             FILTER(h,1,0);
6251             FILTER(h,1,2);
6252         } else {
6253             FILTER(v,0,0);
6254             FILTER(v,0,1);
6255             FILTER(v,0,2);
6256             FILTER(v,0,3);
6257             FILTER(h,1,0);
6258             FILTER(h,1,1);
6259             FILTER(h,1,2);
6260             FILTER(h,1,3);
6261         }
6262 #undef FILTER
6263     }
6264 }
6265
6266
6267 static void av_always_inline filter_mb_dir(H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize, int mb_xy, int mb_type, int mvy_limit, int first_vertical_edge_done, int dir) {
6268     MpegEncContext * const s = &h->s;
6269     int edge;
6270     const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6271     const int mbm_type = s->current_picture.mb_type[mbm_xy];
6272     int (*ref2frm) [64] = h->ref2frm[ h->slice_num          &(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6273     int (*ref2frmm)[64] = h->ref2frm[ h->slice_table[mbm_xy]&(MAX_SLICES-1) ][0] + (MB_MBAFF ? 20 : 2);
6274     int start = h->slice_table[mbm_xy] == 0xFFFF ? 1 : 0;
6275
6276     const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6277                               == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6278     // how often to recheck mv-based bS when iterating between edges
6279     const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6280                           (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6281     // how often to recheck mv-based bS when iterating along each edge
6282     const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6283
6284     if (first_vertical_edge_done) {
6285         start = 1;
6286     }
6287
6288     if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6289         start = 1;
6290
6291     if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6292         && !IS_INTERLACED(mb_type)
6293         && IS_INTERLACED(mbm_type)
6294         ) {
6295         // This is a special case in the norm where the filtering must
6296         // be done twice (one each of the field) even if we are in a
6297         // frame macroblock.
6298         //
6299         static const int nnz_idx[4] = {4,5,6,3};
6300         unsigned int tmp_linesize   = 2 *   linesize;
6301         unsigned int tmp_uvlinesize = 2 * uvlinesize;
6302         int mbn_xy = mb_xy - 2 * s->mb_stride;
6303         int qp;
6304         int i, j;
6305         int16_t bS[4];
6306
6307         for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6308             if( IS_INTRA(mb_type) ||
6309                 IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6310                 bS[0] = bS[1] = bS[2] = bS[3] = 3;
6311             } else {
6312                 const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6313                 for( i = 0; i < 4; i++ ) {
6314                     if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6315                         mbn_nnz[nnz_idx[i]] != 0 )
6316                         bS[i] = 2;
6317                     else
6318                         bS[i] = 1;
6319                 }
6320             }
6321             // Do not use s->qscale as luma quantizer because it has not the same
6322             // value in IPCM macroblocks.
6323             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6324             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6325             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6326             filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6327             filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6328                               ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6329             filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6330                               ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6331         }
6332
6333         start = 1;
6334     }
6335
6336     /* Calculate bS */
6337     for( edge = start; edge < edges; edge++ ) {
6338         /* mbn_xy: neighbor macroblock */
6339         const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6340         const int mbn_type = s->current_picture.mb_type[mbn_xy];
6341         int (*ref2frmn)[64] = edge > 0 ? ref2frm : ref2frmm;
6342         int16_t bS[4];
6343         int qp;
6344
6345         if( (edge&1) && IS_8x8DCT(mb_type) )
6346             continue;
6347
6348         if( IS_INTRA(mb_type) ||
6349             IS_INTRA(mbn_type) ) {
6350             int value;
6351             if (edge == 0) {
6352                 if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6353                     || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6354                 ) {
6355                     value = 4;
6356                 } else {
6357                     value = 3;
6358                 }
6359             } else {
6360                 value = 3;
6361             }
6362             bS[0] = bS[1] = bS[2] = bS[3] = value;
6363         } else {
6364             int i, l;
6365             int mv_done;
6366
6367             if( edge & mask_edge ) {
6368                 bS[0] = bS[1] = bS[2] = bS[3] = 0;
6369                 mv_done = 1;
6370             }
6371             else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6372                 bS[0] = bS[1] = bS[2] = bS[3] = 1;
6373                 mv_done = 1;
6374             }
6375             else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6376                 int b_idx= 8 + 4 + edge * (dir ? 8:1);
6377                 int bn_idx= b_idx - (dir ? 8:1);
6378                 int v = 0;
6379
6380                 for( l = 0; !v && l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6381                     v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6382                          FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6383                          FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6384                 }
6385
6386                 if(h->slice_type_nos == FF_B_TYPE && v){
6387                     v=0;
6388                     for( l = 0; !v && l < 2; l++ ) {
6389                         int ln= 1-l;
6390                         v |= ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6391                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6392                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit;
6393                     }
6394                 }
6395
6396                 bS[0] = bS[1] = bS[2] = bS[3] = v;
6397                 mv_done = 1;
6398             }
6399             else
6400                 mv_done = 0;
6401
6402             for( i = 0; i < 4; i++ ) {
6403                 int x = dir == 0 ? edge : i;
6404                 int y = dir == 0 ? i    : edge;
6405                 int b_idx= 8 + 4 + x + 8*y;
6406                 int bn_idx= b_idx - (dir ? 8:1);
6407
6408                 if( h->non_zero_count_cache[b_idx] |
6409                     h->non_zero_count_cache[bn_idx] ) {
6410                     bS[i] = 2;
6411                 }
6412                 else if(!mv_done)
6413                 {
6414                     bS[i] = 0;
6415                     for( l = 0; l < 1 + (h->slice_type_nos == FF_B_TYPE); l++ ) {
6416                         if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[l][h->ref_cache[l][bn_idx]] ||
6417                             FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6418                             FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6419                             bS[i] = 1;
6420                             break;
6421                         }
6422                     }
6423
6424                     if(h->slice_type_nos == FF_B_TYPE && bS[i]){
6425                         bS[i] = 0;
6426                         for( l = 0; l < 2; l++ ) {
6427                             int ln= 1-l;
6428                             if( ref2frm[l][h->ref_cache[l][b_idx]] != ref2frmn[ln][h->ref_cache[ln][bn_idx]] ||
6429                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[ln][bn_idx][0] ) >= 4 ||
6430                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[ln][bn_idx][1] ) >= mvy_limit ) {
6431                                 bS[i] = 1;
6432                                 break;
6433                             }
6434                         }
6435                     }
6436                 }
6437             }
6438
6439             if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6440                 continue;
6441         }
6442
6443         /* Filter edge */
6444         // Do not use s->qscale as luma quantizer because it has not the same
6445         // value in IPCM macroblocks.
6446         qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6447         //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6448         tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6449         { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6450         if( dir == 0 ) {
6451             filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6452             if( (edge&1) == 0 ) {
6453                 filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6454                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6455                 filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6456                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6457             }
6458         } else {
6459             filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6460             if( (edge&1) == 0 ) {
6461                 filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6462                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6463                 filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6464                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6465             }
6466         }
6467     }
6468 }
6469
/**
 * Run the in-loop deblocking filter on one macroblock.
 *
 * Steps, in order:
 *  - outside MBAFF, return early when the QP of this MB (and the averaged QP
 *    with its left/top neighbours) is too low for filtering to have an effect;
 *  - for CAVLC streams with 8x8 transform, overwrite the affected
 *    non_zero_count_cache entries with per-8x8 flags taken from the cbp values;
 *  - in MBAFF frames, when the left MB pair has a different interlaced type,
 *    filter the first vertical edge here with 8 bS values and 2 QPs;
 *  - hand the remaining edges to filter_mb_dir(), first dir 0, then dir 1.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock
 * @param img_cb     Cb samples of this macroblock
 * @param img_cr     Cr samples of this macroblock
 * @param linesize   luma stride
 * @param uvlinesize chroma stride
 */
6470 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6471     MpegEncContext * const s = &h->s;
6472     const int mb_xy= mb_x + mb_y*s->mb_stride;
6473     const int mb_type = s->current_picture.mb_type[mb_xy];
         // vertical MV difference threshold passed on to filter_mb_dir(); halved for interlaced MBs
6474     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
         // set to 1 once the MBAFF special case below has filtered the left MB edge
6475     int first_vertical_edge_done = 0;
6476     av_unused int dir;
6477
6478     //for sufficiently low qp, filtering wouldn't do anything
6479     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6480     if(!FRAME_MBAFF){
6481         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX3(0, h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]);
6482         int qp = s->current_picture.qscale_table[mb_xy];
             // the left/top tests use the rounded average QP across the shared edge;
             // MBs in the first column/row have no such neighbour
6483         if(qp <= qp_thresh
6484            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6485            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6486             return;
6487         }
6488     }
6489
6490     // CAVLC 8x8dct requires NNZ values for residual decoding that differ from what the loop filter needs
6491     if(!h->pps.cabac && h->pps.transform_8x8_mode){
6492         int top_type, left_type[2];
6493         top_type     = s->current_picture.mb_type[h->top_mb_xy]    ;
6494         left_type[0] = s->current_picture.mb_type[h->left_mb_xy[0]];
6495         left_type[1] = s->current_picture.mb_type[h->left_mb_xy[1]];
6496
             // top neighbour: cache row 0 is filled from cbp bits 4 and 8 of the MB above
6497         if(IS_8x8DCT(top_type)){
6498             h->non_zero_count_cache[4+8*0]=
6499             h->non_zero_count_cache[5+8*0]= h->cbp_table[h->top_mb_xy] & 4;
6500             h->non_zero_count_cache[6+8*0]=
6501             h->non_zero_count_cache[7+8*0]= h->cbp_table[h->top_mb_xy] & 8;
6502         }
             // left neighbours: cache column 3 is filled from cbp bits 2 and 8 of the MBs to the left
6503         if(IS_8x8DCT(left_type[0])){
6504             h->non_zero_count_cache[3+8*1]=
6505             h->non_zero_count_cache[3+8*2]= h->cbp_table[h->left_mb_xy[0]]&2; //FIXME check MBAFF
6506         }
6507         if(IS_8x8DCT(left_type[1])){
6508             h->non_zero_count_cache[3+8*3]=
6509             h->non_zero_count_cache[3+8*4]= h->cbp_table[h->left_mb_xy[1]]&8; //FIXME check MBAFF
6510         }
6511
             // current MB: each group of four scan8 entries gets one bit of h->cbp
6512         if(IS_8x8DCT(mb_type)){
6513             h->non_zero_count_cache[scan8[0   ]]= h->non_zero_count_cache[scan8[1   ]]=
6514             h->non_zero_count_cache[scan8[2   ]]= h->non_zero_count_cache[scan8[3   ]]= h->cbp & 1;
6515
6516             h->non_zero_count_cache[scan8[0+ 4]]= h->non_zero_count_cache[scan8[1+ 4]]=
6517             h->non_zero_count_cache[scan8[2+ 4]]= h->non_zero_count_cache[scan8[3+ 4]]= h->cbp & 2;
6518
6519             h->non_zero_count_cache[scan8[0+ 8]]= h->non_zero_count_cache[scan8[1+ 8]]=
6520             h->non_zero_count_cache[scan8[2+ 8]]= h->non_zero_count_cache[scan8[3+ 8]]= h->cbp & 4;
6521
6522             h->non_zero_count_cache[scan8[0+12]]= h->non_zero_count_cache[scan8[1+12]]=
6523             h->non_zero_count_cache[scan8[2+12]]= h->non_zero_count_cache[scan8[3+12]]= h->cbp & 8;
6524         }
6525     }
6526
6527     if (FRAME_MBAFF
6528             // left mb is in picture
6529             && h->slice_table[mb_xy-1] != 0xFFFF
6530             // and current and left pair do not have the same interlaced type
6531             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6532             // and left mb is in the same slice if deblocking_filter == 2
6533             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6534         /* First vertical edge is different in MBAFF frames
6535          * There are 8 different bS to compute and 2 different Qp
6536          */
             // pair_xy addresses the top MB of the current pair (mb_y with its low bit cleared)
6537         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
             // the two MBs of the pair to the left: [0] on row mb_y&~1, [1] one row below
6538         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6539         int16_t bS[8];
             // luma (qp), Cb (bqp) and Cr (rqp) filter QPs, one entry per left neighbour MB
6540         int qp[2];
6541         int bqp[2];
6542         int rqp[2];
6543         int mb_qp, mbn0_qp, mbn1_qp;
6544         int i;
6545         first_vertical_edge_done = 1;
6546
             // bS is 4 if either side is intra, 2 if either side has non-zero
             // coefficients, and 1 otherwise
6547         if( IS_INTRA(mb_type) )
6548             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6549         else {
6550             for( i = 0; i < 8; i++ ) {
                     // field MB: entries 0-3 use left_mb_xy[0] and 4-7 use left_mb_xy[1];
                     // frame MB: the two left MBs alternate entry by entry
6551                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6552
6553                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6554                     bS[i] = 4;
                     // coefficient test: current MB via the NNZ cache, left MB via its cbp bits
                     // (CAVLC 8x8dct) or its stored non_zero_count otherwise
6555                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6556                          ((!h->pps.cabac && IS_8x8DCT(s->current_picture.mb_type[mbn_xy])) ?
6557                             (h->cbp_table[mbn_xy] & ((MB_FIELD ? (i&2) : (mb_y&1)) ? 8 : 2))
6558                                                                        :
6559                             h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2]))
6560                     bS[i] = 2;
6561                 else
6562                     bS[i] = 1;
6563             }
6564         }
6565
             // each filter QP is the rounded average of this MB's QP and one left neighbour's QP;
             // the chroma QPs are mapped through get_chroma_qp() before averaging
6566         mb_qp = s->current_picture.qscale_table[mb_xy];
6567         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6568         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6569         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6570         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6571                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6572         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6573                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6574         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6575         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6576                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6577         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6578                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6579
6580         /* Filter edge */
6581         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6582         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6583         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6584         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6585         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6586     }
6587
         // Both variants make the same two calls: dir 0 receives first_vertical_edge_done,
         // dir 1 always receives 0. CONFIG_SMALL uses a loop instead of two call sites.
6588 #if CONFIG_SMALL
6589     for( dir = 0; dir < 2; dir++ )
6590         filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, dir ? 0 : first_vertical_edge_done, dir);
6591 #else
6592     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, first_vertical_edge_done, 0);
6593     filter_mb_dir(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize, mb_xy, mb_type, mvy_limit, 0, 1);
6594 #endif
6595 }
6596
6597 static int decode_slice(struct AVCodecContext *avctx, void *arg){
6598     H264Context *h = *(void**)arg;
6599     MpegEncContext * const s = &h->s;
6600     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6601
6602     s->mb_skip_run= -1;
6603
6604     h->is_complex = FRAME_MBAFF || s->picture_structure != PICT_FRAME || s->codec_id != CODEC_ID_H264 ||
6605                     (CONFIG_GRAY && (s->flags&CODEC_FLAG_GRAY));
6606
6607     if( h->pps.cabac ) {
6608         int i;
6609
6610         /* realign */
6611         align_get_bits( &s->gb );
6612
6613         /* init cabac */
6614         ff_init_cabac_states( &h->cabac);
6615         ff_init_cabac_decoder( &h->cabac,
6616                                s->gb.buffer + get_bits_count(&s->gb)/8,
6617                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6618         /* calculate pre-state */
6619         for( i= 0; i < 460; i++ ) {
6620             int pre;
6621             if( h->slice_type_nos == FF_I_TYPE )
6622                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6623             else
6624                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6625
6626             if( pre <= 63 )
6627                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6628             else
6629                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6630         }
6631
6632         for(;;){
6633 //START_TIMER
6634             int ret = decode_mb_cabac(h);
6635             int eos;
6636 //STOP_TIMER("decode_mb_cabac")
6637
6638             if(ret>=0) hl_decode_mb(h);
6639
6640             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6641                 s->mb_y++;
6642
6643                 ret = decode_mb_cabac(h);
6644
6645                 if(ret>=0) hl_decode_mb(h);
6646                 s->mb_y--;
6647             }
6648             eos = get_cabac_terminate( &h->cabac );
6649
6650             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6651                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6652                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6653                 return -1;
6654             }
6655
6656             if( ++s->mb_x >= s->mb_width ) {
6657                 s->mb_x = 0;
6658                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6659                 ++s->mb_y;
6660                 if(FIELD_OR_MBAFF_PICTURE) {
6661                     ++s->mb_y;
6662                 }
6663             }
6664
6665             if( eos || s->mb_y >= s->mb_height ) {
6666                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6667                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6668                 return 0;
6669             }
6670         }
6671
6672     } else {
6673         for(;;){
6674             int ret = decode_mb_cavlc(h);
6675
6676             if(ret>=0) hl_decode_mb(h);
6677
6678             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6679                 s->mb_y++;
6680                 ret = decode_mb_cavlc(h);
6681
6682                 if(ret>=0) hl_decode_mb(h);
6683                 s->mb_y--;
6684             }
6685
6686             if(ret<0){
6687                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6688                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6689
6690                 return -1;
6691             }
6692
6693             if(++s->mb_x >= s->mb_width){
6694                 s->mb_x=0;
6695                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6696                 ++s->mb_y;
6697                 if(FIELD_OR_MBAFF_PICTURE) {
6698                     ++s->mb_y;
6699                 }
6700                 if(s->mb_y >= s->mb_height){
6701                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6702
6703                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6704                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6705
6706                         return 0;
6707                     }else{
6708                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6709
6710                         return -1;
6711                     }
6712                 }
6713             }
6714
6715             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6716                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6717                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6718                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6719
6720                     return 0;
6721                 }else{
6722                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6723
6724                     return -1;
6725                 }
6726             }
6727         }
6728     }
6729
6730 #if 0
6731     for(;s->mb_y < s->mb_height; s->mb_y++){
6732         for(;s->mb_x < s->mb_width; s->mb_x++){
6733             int ret= decode_mb(h);
6734
6735             hl_decode_mb(h);
6736
6737             if(ret<0){
6738                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6739                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6740
6741                 return -1;
6742             }
6743
6744             if(++s->mb_x >= s->mb_width){
6745                 s->mb_x=0;
6746                 if(++s->mb_y >= s->mb_height){
6747                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6748                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6749
6750                         return 0;
6751                     }else{
6752                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6753
6754                         return -1;
6755                     }
6756                 }
6757             }
6758
6759             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6760                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6761                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6762
6763                     return 0;
6764                 }else{
6765                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6766
6767                     return -1;
6768                 }
6769             }
6770         }
6771         s->mb_x=0;
6772         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6773     }
6774 #endif
6775     return -1; //not reached
6776 }
6777
6778 static int decode_picture_timing(H264Context *h){
6779     MpegEncContext * const s = &h->s;
6780     if(h->sps.nal_hrd_parameters_present_flag || h->sps.vcl_hrd_parameters_present_flag){
6781         skip_bits(&s->gb, h->sps.cpb_removal_delay_length); /* cpb_removal_delay */
6782         skip_bits(&s->gb, h->sps.dpb_output_delay_length);  /* dpb_output_delay */
6783     }
6784     if(h->sps.pic_struct_present_flag){
6785         unsigned int i, num_clock_ts;
6786         h->sei_pic_struct = get_bits(&s->gb, 4);
6787
6788         if (h->sei_pic_struct > SEI_PIC_STRUCT_FRAME_TRIPLING)
6789             return -1;
6790
6791         num_clock_ts = sei_num_clock_ts_table[h->sei_pic_struct];
6792
6793         for (i = 0 ; i < num_clock_ts ; i++){
6794             if(get_bits(&s->gb, 1)){                  /* clock_timestamp_flag */
6795                 unsigned int full_timestamp_flag;
6796                 skip_bits(&s->gb, 2);                 /* ct_type */
6797                 skip_bits(&s->gb, 1);                 /* nuit_field_based_flag */
6798                 skip_bits(&s->gb, 5);                 /* counting_type */
6799                 full_timestamp_flag = get_bits(&s->gb, 1);
6800                 skip_bits(&s->gb, 1);                 /* discontinuity_flag */
6801                 skip_bits(&s->gb, 1);                 /* cnt_dropped_flag */
6802                 skip_bits(&s->gb, 8);                 /* n_frames */
6803                 if(full_timestamp_flag){
6804                     skip_bits(&s->gb, 6);             /* seconds_value 0..59 */
6805                     skip_bits(&s->gb, 6);             /* minutes_value 0..59 */
6806                     skip_bits(&s->gb, 5);             /* hours_value 0..23 */
6807                 }else{
6808                     if(get_bits(&s->gb, 1)){          /* seconds_flag */
6809                         skip_bits(&s->gb, 6);         /* seconds_value range 0..59 */
6810                         if(get_bits(&s->gb, 1)){      /* minutes_flag */
6811                             skip_bits(&s->gb, 6);     /* minutes_value 0..59 */
6812                             if(get_bits(&s->gb, 1))   /* hours_flag */
6813                                 skip_bits(&s->gb, 5); /* hours_value 0..23 */
6814                         }
6815                     }
6816                 }
6817                 if(h->sps.time_offset_length > 0)
6818                     skip_bits(&s->gb, h->sps.time_offset_length); /* time_offset */
6819             }
6820         }
6821     }
6822     return 0;
6823 }
6824
6825 static int decode_unregistered_user_data(H264Context *h, int size){
6826     MpegEncContext * const s = &h->s;
6827     uint8_t user_data[16+256];
6828     int e, build, i;
6829
6830     if(size<16)
6831         return -1;
6832
6833     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6834         user_data[i]= get_bits(&s->gb, 8);
6835     }
6836
6837     user_data[i]= 0;
6838     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
6839     if(e==1 && build>=0)
6840         h->x264_build= build;
6841
6842     if(s->avctx->debug & FF_DEBUG_BUGS)
6843         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6844
6845     for(; i<size; i++)
6846         skip_bits(&s->gb, 8);
6847
6848     return 0;
6849 }
6850
6851 static int decode_sei(H264Context *h){
6852     MpegEncContext * const s = &h->s;
6853
6854     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6855         int size, type;
6856
6857         type=0;
6858         do{
6859             type+= show_bits(&s->gb, 8);
6860         }while(get_bits(&s->gb, 8) == 255);
6861
6862         size=0;
6863         do{
6864             size+= show_bits(&s->gb, 8);
6865         }while(get_bits(&s->gb, 8) == 255);
6866
6867         switch(type){
6868         case SEI_TYPE_PIC_TIMING: // Picture timing SEI
6869             if(decode_picture_timing(h) < 0)
6870                 return -1;
6871             break;
6872         case SEI_TYPE_USER_DATA_UNREGISTERED:
6873             if(decode_unregistered_user_data(h, size) < 0)
6874                 return -1;
6875             break;
6876         default:
6877             skip_bits(&s->gb, 8*size);
6878         }
6879
6880         //FIXME check bits here
6881         align_get_bits(&s->gb);
6882     }
6883
6884     return 0;
6885 }
6886
6887 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6888     MpegEncContext * const s = &h->s;
6889     int cpb_count, i;
6890     cpb_count = get_ue_golomb_31(&s->gb) + 1;
6891
6892     if(cpb_count > 32U){
6893         av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
6894         return -1;
6895     }
6896
6897     get_bits(&s->gb, 4); /* bit_rate_scale */
6898     get_bits(&s->gb, 4); /* cpb_size_scale */
6899     for(i=0; i<cpb_count; i++){
6900         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6901         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6902         get_bits1(&s->gb);     /* cbr_flag */
6903     }
6904     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6905     sps->cpb_removal_delay_length = get_bits(&s->gb, 5) + 1;
6906     sps->dpb_output_delay_length = get_bits(&s->gb, 5) + 1;
6907     sps->time_offset_length = get_bits(&s->gb, 5);
6908     return 0;
6909 }
6910
6911 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6912     MpegEncContext * const s = &h->s;
6913     int aspect_ratio_info_present_flag;
6914     unsigned int aspect_ratio_idc;
6915
6916     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6917
6918     if( aspect_ratio_info_present_flag ) {
6919         aspect_ratio_idc= get_bits(&s->gb, 8);
6920         if( aspect_ratio_idc == EXTENDED_SAR ) {
6921             sps->sar.num= get_bits(&s->gb, 16);
6922             sps->sar.den= get_bits(&s->gb, 16);
6923         }else if(aspect_ratio_idc < FF_ARRAY_ELEMS(pixel_aspect)){
6924             sps->sar=  pixel_aspect[aspect_ratio_idc];
6925         }else{
6926             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6927             return -1;
6928         }
6929     }else{
6930         sps->sar.num=
6931         sps->sar.den= 0;
6932     }
6933 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6934
6935     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6936         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6937     }
6938
6939     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6940         get_bits(&s->gb, 3);    /* video_format */
6941         get_bits1(&s->gb);      /* video_full_range_flag */
6942         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6943             get_bits(&s->gb, 8); /* colour_primaries */
6944             get_bits(&s->gb, 8); /* transfer_characteristics */
6945             get_bits(&s->gb, 8); /* matrix_coefficients */
6946         }
6947     }
6948
6949     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6950         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6951         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6952     }
6953
6954     sps->timing_info_present_flag = get_bits1(&s->gb);
6955     if(sps->timing_info_present_flag){
6956         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6957         sps->time_scale = get_bits_long(&s->gb, 32);
6958         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6959     }
6960
6961     sps->nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6962     if(sps->nal_hrd_parameters_present_flag)
6963         if(decode_hrd_parameters(h, sps) < 0)
6964             return -1;
6965     sps->vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6966     if(sps->vcl_hrd_parameters_present_flag)
6967         if(decode_hrd_parameters(h, sps) < 0)
6968             return -1;
6969     if(sps->nal_hrd_parameters_present_flag || sps->vcl_hrd_parameters_present_flag)
6970         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6971     sps->pic_struct_present_flag = get_bits1(&s->gb);
6972
6973     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6974     if(sps->bitstream_restriction_flag){
6975         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6976         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6977         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6978         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6979         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6980         sps->num_reorder_frames= get_ue_golomb(&s->gb);
6981         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
6982
6983         if(sps->num_reorder_frames > 16U /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
6984             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %d\n", sps->num_reorder_frames);
6985             return -1;
6986         }
6987     }
6988
6989     return 0;
6990 }
6991
6992 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
6993                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
6994     MpegEncContext * const s = &h->s;
6995     int i, last = 8, next = 8;
6996     const uint8_t *scan = size == 16 ? zigzag_scan : ff_zigzag_direct;
6997     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
6998         memcpy(factors, fallback_list, size*sizeof(uint8_t));
6999     else
7000     for(i=0;i<size;i++){
7001         if(next)
7002             next = (last + get_se_golomb(&s->gb)) & 0xff;
7003         if(!i && !next){ /* matrix not written, we use the preset one */
7004             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7005             break;
7006         }
7007         last = factors[scan[i]] = next ? next : last;
7008     }
7009 }
7010
7011 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7012                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7013     MpegEncContext * const s = &h->s;
7014     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7015     const uint8_t *fallback[4] = {
7016         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7017         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7018         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7019         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7020     };
7021     if(get_bits1(&s->gb)){
7022         sps->scaling_matrix_present |= is_sps;
7023         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7024         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7025         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7026         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7027         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7028         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7029         if(is_sps || pps->transform_8x8_mode){
7030             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7031             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7032         }
7033     }
7034 }
7035
7036 static inline int decode_seq_parameter_set(H264Context *h){
7037     MpegEncContext * const s = &h->s;
7038     int profile_idc, level_idc;
7039     unsigned int sps_id;
7040     int i;
7041     SPS *sps;
7042
7043     profile_idc= get_bits(&s->gb, 8);
7044     get_bits1(&s->gb);   //constraint_set0_flag
7045     get_bits1(&s->gb);   //constraint_set1_flag
7046     get_bits1(&s->gb);   //constraint_set2_flag
7047     get_bits1(&s->gb);   //constraint_set3_flag
7048     get_bits(&s->gb, 4); // reserved
7049     level_idc= get_bits(&s->gb, 8);
7050     sps_id= get_ue_golomb_31(&s->gb);
7051
7052     if(sps_id >= MAX_SPS_COUNT) {
7053         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id (%d) out of range\n", sps_id);
7054         return -1;
7055     }
7056     sps= av_mallocz(sizeof(SPS));
7057     if(sps == NULL)
7058         return -1;
7059
7060     sps->profile_idc= profile_idc;
7061     sps->level_idc= level_idc;
7062
7063     memset(sps->scaling_matrix4, 16, sizeof(sps->scaling_matrix4));
7064     memset(sps->scaling_matrix8, 16, sizeof(sps->scaling_matrix8));
7065     sps->scaling_matrix_present = 0;
7066
7067     if(sps->profile_idc >= 100){ //high profile
7068         sps->chroma_format_idc= get_ue_golomb_31(&s->gb);
7069         if(sps->chroma_format_idc == 3)
7070             sps->residual_color_transform_flag = get_bits1(&s->gb);
7071         sps->bit_depth_luma   = get_ue_golomb(&s->gb) + 8;
7072         sps->bit_depth_chroma = get_ue_golomb(&s->gb) + 8;
7073         sps->transform_bypass = get_bits1(&s->gb);
7074         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7075     }else{
7076         sps->chroma_format_idc= 1;
7077     }
7078
7079     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7080     sps->poc_type= get_ue_golomb_31(&s->gb);
7081
7082     if(sps->poc_type == 0){ //FIXME #define
7083         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7084     } else if(sps->poc_type == 1){//FIXME #define
7085         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7086         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7087         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7088         sps->poc_cycle_length                = get_ue_golomb(&s->gb);
7089
7090         if((unsigned)sps->poc_cycle_length >= FF_ARRAY_ELEMS(sps->offset_for_ref_frame)){
7091             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", sps->poc_cycle_length);
7092             goto fail;
7093         }
7094
7095         for(i=0; i<sps->poc_cycle_length; i++)
7096             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7097     }else if(sps->poc_type != 2){
7098         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7099         goto fail;
7100     }
7101
7102     sps->ref_frame_count= get_ue_golomb_31(&s->gb);
7103     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2 || sps->ref_frame_count >= 32U){
7104         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7105         goto fail;
7106     }
7107     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7108     sps->mb_width = get_ue_golomb(&s->gb) + 1;
7109     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7110     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7111        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height)){
7112         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7113         goto fail;
7114     }
7115
7116     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7117     if(!sps->frame_mbs_only_flag)
7118         sps->mb_aff= get_bits1(&s->gb);
7119     else
7120         sps->mb_aff= 0;
7121
7122     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7123
7124 #ifndef ALLOW_INTERLACE
7125     if(sps->mb_aff)
7126         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7127 #endif
7128     sps->crop= get_bits1(&s->gb);
7129     if(sps->crop){
7130         sps->crop_left  = get_ue_golomb(&s->gb);
7131         sps->crop_right = get_ue_golomb(&s->gb);
7132         sps->crop_top   = get_ue_golomb(&s->gb);
7133         sps->crop_bottom= get_ue_golomb(&s->gb);
7134         if(sps->crop_left || sps->crop_top){
7135             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7136         }
7137         if(sps->crop_right >= 8 || sps->crop_bottom >= (8>> !sps->frame_mbs_only_flag)){
7138             av_log(h->s.avctx, AV_LOG_ERROR, "brainfart cropping not supported, this could look slightly wrong ...\n");
7139         }
7140     }else{
7141         sps->crop_left  =
7142         sps->crop_right =
7143         sps->crop_top   =
7144         sps->crop_bottom= 0;
7145     }
7146
7147     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7148     if( sps->vui_parameters_present_flag )
7149         decode_vui_parameters(h, sps);
7150
7151     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7152         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s %s\n",
7153                sps_id, sps->profile_idc, sps->level_idc,
7154                sps->poc_type,
7155                sps->ref_frame_count,
7156                sps->mb_width, sps->mb_height,
7157                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7158                sps->direct_8x8_inference_flag ? "8B8" : "",
7159                sps->crop_left, sps->crop_right,
7160                sps->crop_top, sps->crop_bottom,
7161                sps->vui_parameters_present_flag ? "VUI" : "",
7162                ((const char*[]){"Gray","420","422","444"})[sps->chroma_format_idc]
7163                );
7164     }
7165
7166     av_free(h->sps_buffers[sps_id]);
7167     h->sps_buffers[sps_id]= sps;
7168     return 0;
7169 fail:
7170     av_free(sps);
7171     return -1;
7172 }
7173
7174 static void
7175 build_qp_table(PPS *pps, int t, int index)
7176 {
7177     int i;
7178     for(i = 0; i < 52; i++)
7179         pps->chroma_qp_table[t][i] = chroma_qp[av_clip(i + index, 0, 51)];
7180 }
7181
7182 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7183     MpegEncContext * const s = &h->s;
7184     unsigned int pps_id= get_ue_golomb(&s->gb);
7185     PPS *pps;
7186
7187     if(pps_id >= MAX_PPS_COUNT) {
7188         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id (%d) out of range\n", pps_id);
7189         return -1;
7190     }
7191
7192     pps= av_mallocz(sizeof(PPS));
7193     if(pps == NULL)
7194         return -1;
7195     pps->sps_id= get_ue_golomb_31(&s->gb);
7196     if((unsigned)pps->sps_id>=MAX_SPS_COUNT || h->sps_buffers[pps->sps_id] == NULL){
7197         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7198         goto fail;
7199     }
7200
7201     pps->cabac= get_bits1(&s->gb);
7202     pps->pic_order_present= get_bits1(&s->gb);
7203     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7204     if(pps->slice_group_count > 1 ){
7205         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7206         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7207         switch(pps->mb_slice_group_map_type){
7208         case 0:
7209 #if 0
7210 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7211 |    run_length[ i ]                                |1  |ue(v)   |
7212 #endif
7213             break;
7214         case 2:
7215 #if 0
7216 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7217 |{                                                  |   |        |
7218 |    top_left_mb[ i ]                               |1  |ue(v)   |
7219 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7220 |   }                                               |   |        |
7221 #endif
7222             break;
7223         case 3:
7224         case 4:
7225         case 5:
7226 #if 0
7227 |   slice_group_change_direction_flag               |1  |u(1)    |
7228 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7229 #endif
7230             break;
7231         case 6:
7232 #if 0
7233 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7234 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7235 |)                                                  |   |        |
7236 |    slice_group_id[ i ]                            |1  |u(v)    |
7237 #endif
7238             break;
7239         }
7240     }
7241     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7242     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7243     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7244         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7245         goto fail;
7246     }
7247
7248     pps->weighted_pred= get_bits1(&s->gb);
7249     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7250     pps->init_qp= get_se_golomb(&s->gb) + 26;
7251     pps->init_qs= get_se_golomb(&s->gb) + 26;
7252     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7253     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7254     pps->constrained_intra_pred= get_bits1(&s->gb);
7255     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7256
7257     pps->transform_8x8_mode= 0;
7258     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7259     memcpy(pps->scaling_matrix4, h->sps_buffers[pps->sps_id]->scaling_matrix4, sizeof(pps->scaling_matrix4));
7260     memcpy(pps->scaling_matrix8, h->sps_buffers[pps->sps_id]->scaling_matrix8, sizeof(pps->scaling_matrix8));
7261
7262     if(get_bits_count(&s->gb) < bit_length){
7263         pps->transform_8x8_mode= get_bits1(&s->gb);
7264         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7265         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7266     } else {
7267         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7268     }
7269
7270     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7271     build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7272     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1])
7273         h->pps.chroma_qp_diff= 1;
7274
7275     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7276         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7277                pps_id, pps->sps_id,
7278                pps->cabac ? "CABAC" : "CAVLC",
7279                pps->slice_group_count,
7280                pps->ref_count[0], pps->ref_count[1],
7281                pps->weighted_pred ? "weighted" : "",
7282                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7283                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7284                pps->constrained_intra_pred ? "CONSTR" : "",
7285                pps->redundant_pic_cnt_present ? "REDU" : "",
7286                pps->transform_8x8_mode ? "8x8DCT" : ""
7287                );
7288     }
7289
7290     av_free(h->pps_buffers[pps_id]);
7291     h->pps_buffers[pps_id]= pps;
7292     return 0;
7293 fail:
7294     av_free(pps);
7295     return -1;
7296 }
7297
7298 /**
7299  * Call decode_slice() for each context.
7300  *
7301  * @param h h264 master context
7302  * @param context_count number of contexts to execute
7303  */
7304 static void execute_decode_slices(H264Context *h, int context_count){
7305     MpegEncContext * const s = &h->s;
7306     AVCodecContext * const avctx= s->avctx;
7307     H264Context *hx;
7308     int i;
7309
7310     if(s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7311         return;
7312     if(context_count == 1) {
7313         decode_slice(avctx, &h);
7314     } else {
7315         for(i = 1; i < context_count; i++) {
7316             hx = h->thread_context[i];
7317             hx->s.error_recognition = avctx->error_recognition;
7318             hx->s.error_count = 0;
7319         }
7320
7321         avctx->execute(avctx, (void *)decode_slice,
7322                        (void **)h->thread_context, NULL, context_count, sizeof(void*));
7323
7324         /* pull back stuff from slices to master context */
7325         hx = h->thread_context[context_count - 1];
7326         s->mb_x = hx->s.mb_x;
7327         s->mb_y = hx->s.mb_y;
7328         s->dropable = hx->s.dropable;
7329         s->picture_structure = hx->s.picture_structure;
7330         for(i = 1; i < context_count; i++)
7331             h->s.error_count += h->thread_context[i]->s.error_count;
7332     }
7333 }
7334
7335
7336 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7337     MpegEncContext * const s = &h->s;
7338     AVCodecContext * const avctx= s->avctx;
7339     int buf_index=0;
7340     H264Context *hx; ///< thread context
7341     int context_count = 0;
7342
7343     h->max_contexts = avctx->thread_count;
7344 #if 0
7345     int i;
7346     for(i=0; i<50; i++){
7347         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7348     }
7349 #endif
7350     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7351         h->current_slice = 0;
7352         if (!s->first_field)
7353             s->current_picture_ptr= NULL;
7354     }
7355
7356     for(;;){
7357         int consumed;
7358         int dst_length;
7359         int bit_length;
7360         const uint8_t *ptr;
7361         int i, nalsize = 0;
7362         int err;
7363
7364         if(h->is_avc) {
7365             if(buf_index >= buf_size) break;
7366             nalsize = 0;
7367             for(i = 0; i < h->nal_length_size; i++)
7368                 nalsize = (nalsize << 8) | buf[buf_index++];
7369             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7370                 if(nalsize == 1){
7371                     buf_index++;
7372                     continue;
7373                 }else{
7374                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7375                     break;
7376                 }
7377             }
7378         } else {
7379             // start code prefix search
7380             for(; buf_index + 3 < buf_size; buf_index++){
7381                 // This should always succeed in the first iteration.
7382                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7383                     break;
7384             }
7385
7386             if(buf_index+3 >= buf_size) break;
7387
7388             buf_index+=3;
7389         }
7390
7391         hx = h->thread_context[context_count];
7392
7393         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7394         if (ptr==NULL || dst_length < 0){
7395             return -1;
7396         }
7397         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7398             dst_length--;
7399         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7400
7401         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7402             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7403         }
7404
7405         if (h->is_avc && (nalsize != consumed)){
7406             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7407             consumed= nalsize;
7408         }
7409
7410         buf_index += consumed;
7411
7412         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7413            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7414             continue;
7415
7416       again:
7417         err = 0;
7418         switch(hx->nal_unit_type){
7419         case NAL_IDR_SLICE:
7420             if (h->nal_unit_type != NAL_IDR_SLICE) {
7421                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7422                 return -1;
7423             }
7424             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7425         case NAL_SLICE:
7426             init_get_bits(&hx->s.gb, ptr, bit_length);
7427             hx->intra_gb_ptr=
7428             hx->inter_gb_ptr= &hx->s.gb;
7429             hx->s.data_partitioning = 0;
7430
7431             if((err = decode_slice_header(hx, h)))
7432                break;
7433
7434             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7435             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7436                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7437                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7438                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7439                && avctx->skip_frame < AVDISCARD_ALL){
7440                 if(CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU){
7441                     static const uint8_t start_code[] = {0x00, 0x00, 0x01};
7442                     ff_vdpau_add_data_chunk(s, start_code, sizeof(start_code));
7443                     ff_vdpau_add_data_chunk(s, &buf[buf_index - consumed], consumed );
7444                 }else
7445                     context_count++;
7446             }
7447             break;
7448         case NAL_DPA:
7449             init_get_bits(&hx->s.gb, ptr, bit_length);
7450             hx->intra_gb_ptr=
7451             hx->inter_gb_ptr= NULL;
7452             hx->s.data_partitioning = 1;
7453
7454             err = decode_slice_header(hx, h);
7455             break;
7456         case NAL_DPB:
7457             init_get_bits(&hx->intra_gb, ptr, bit_length);
7458             hx->intra_gb_ptr= &hx->intra_gb;
7459             break;
7460         case NAL_DPC:
7461             init_get_bits(&hx->inter_gb, ptr, bit_length);
7462             hx->inter_gb_ptr= &hx->inter_gb;
7463
7464             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7465                && s->context_initialized
7466                && s->hurry_up < 5
7467                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7468                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type_nos!=FF_B_TYPE)
7469                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type_nos==FF_I_TYPE)
7470                && avctx->skip_frame < AVDISCARD_ALL)
7471                 context_count++;
7472             break;
7473         case NAL_SEI:
7474             init_get_bits(&s->gb, ptr, bit_length);
7475             decode_sei(h);
7476             break;
7477         case NAL_SPS:
7478             init_get_bits(&s->gb, ptr, bit_length);
7479             decode_seq_parameter_set(h);
7480
7481             if(s->flags& CODEC_FLAG_LOW_DELAY)
7482                 s->low_delay=1;
7483
7484             if(avctx->has_b_frames < 2)
7485                 avctx->has_b_frames= !s->low_delay;
7486             break;
7487         case NAL_PPS:
7488             init_get_bits(&s->gb, ptr, bit_length);
7489
7490             decode_picture_parameter_set(h, bit_length);
7491
7492             break;
7493         case NAL_AUD:
7494         case NAL_END_SEQUENCE:
7495         case NAL_END_STREAM:
7496         case NAL_FILLER_DATA:
7497         case NAL_SPS_EXT:
7498         case NAL_AUXILIARY_SLICE:
7499             break;
7500         default:
7501             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7502         }
7503
7504         if(context_count == h->max_contexts) {
7505             execute_decode_slices(h, context_count);
7506             context_count = 0;
7507         }
7508
7509         if (err < 0)
7510             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7511         else if(err == 1) {
7512             /* Slice could not be decoded in parallel mode, copy down
7513              * NAL unit stuff to context 0 and restart. Note that
7514              * rbsp_buffer is not transferred, but since we no longer
7515              * run in parallel mode this should not be an issue. */
7516             h->nal_unit_type = hx->nal_unit_type;
7517             h->nal_ref_idc   = hx->nal_ref_idc;
7518             hx = h;
7519             goto again;
7520         }
7521     }
7522     if(context_count)
7523         execute_decode_slices(h, context_count);
7524     return buf_index;
7525 }
7526
7527 /**
7528  * returns the number of bytes consumed for building the current frame
7529  */
7530 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7531         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7532         if(pos+10>buf_size) pos=buf_size; // oops ;)
7533
7534         return pos;
7535 }
7536
7537 static int decode_frame(AVCodecContext *avctx,
7538                              void *data, int *data_size,
7539                              const uint8_t *buf, int buf_size)
7540 {
7541     H264Context *h = avctx->priv_data;
7542     MpegEncContext *s = &h->s;
7543     AVFrame *pict = data;
7544     int buf_index;
7545
7546     s->flags= avctx->flags;
7547     s->flags2= avctx->flags2;
7548
7549    /* end of stream, output what is still in the buffers */
7550     if (buf_size == 0) {
7551         Picture *out;
7552         int i, out_idx;
7553
7554 //FIXME factorize this with the output code below
7555         out = h->delayed_pic[0];
7556         out_idx = 0;
7557         for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7558             if(h->delayed_pic[i]->poc < out->poc){
7559                 out = h->delayed_pic[i];
7560                 out_idx = i;
7561             }
7562
7563         for(i=out_idx; h->delayed_pic[i]; i++)
7564             h->delayed_pic[i] = h->delayed_pic[i+1];
7565
7566         if(out){
7567             *data_size = sizeof(AVFrame);
7568             *pict= *(AVFrame*)out;
7569         }
7570
7571         return 0;
7572     }
7573
7574     if(h->is_avc && !h->got_avcC) {
7575         int i, cnt, nalsize;
7576         unsigned char *p = avctx->extradata;
7577         if(avctx->extradata_size < 7) {
7578             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7579             return -1;
7580         }
7581         if(*p != 1) {
7582             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7583             return -1;
7584         }
7585         /* sps and pps in the avcC always have length coded with 2 bytes,
7586            so put a fake nal_length_size = 2 while parsing them */
7587         h->nal_length_size = 2;
7588         // Decode sps from avcC
7589         cnt = *(p+5) & 0x1f; // Number of sps
7590         p += 6;
7591         for (i = 0; i < cnt; i++) {
7592             nalsize = AV_RB16(p) + 2;
7593             if(decode_nal_units(h, p, nalsize) < 0) {
7594                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7595                 return -1;
7596             }
7597             p += nalsize;
7598         }
7599         // Decode pps from avcC
7600         cnt = *(p++); // Number of pps
7601         for (i = 0; i < cnt; i++) {
7602             nalsize = AV_RB16(p) + 2;
7603             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7604                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7605                 return -1;
7606             }
7607             p += nalsize;
7608         }
7609         // Now store right nal length size, that will be use to parse all other nals
7610         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7611         // Do not reparse avcC
7612         h->got_avcC = 1;
7613     }
7614
7615     if(!h->got_avcC && !h->is_avc && s->avctx->extradata_size){
7616         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7617             return -1;
7618         h->got_avcC = 1;
7619     }
7620
7621     buf_index=decode_nal_units(h, buf, buf_size);
7622     if(buf_index < 0)
7623         return -1;
7624
7625     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7626         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7627         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7628         return -1;
7629     }
7630
7631     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7632         Picture *out = s->current_picture_ptr;
7633         Picture *cur = s->current_picture_ptr;
7634         int i, pics, cross_idr, out_of_order, out_idx;
7635
7636         s->mb_y= 0;
7637
7638         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7639         s->current_picture_ptr->pict_type= s->pict_type;
7640
7641         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7642             ff_vdpau_h264_set_reference_frames(s);
7643
7644         if(!s->dropable) {
7645             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7646             h->prev_poc_msb= h->poc_msb;
7647             h->prev_poc_lsb= h->poc_lsb;
7648         }
7649         h->prev_frame_num_offset= h->frame_num_offset;
7650         h->prev_frame_num= h->frame_num;
7651
7652         if (CONFIG_H264_VDPAU_DECODER && s->avctx->codec->capabilities&CODEC_CAP_HWACCEL_VDPAU)
7653             ff_vdpau_h264_picture_complete(s);
7654
7655         /*
7656          * FIXME: Error handling code does not seem to support interlaced
7657          * when slices span multiple rows
7658          * The ff_er_add_slice calls don't work right for bottom
7659          * fields; they cause massive erroneous error concealing
7660          * Error marking covers both fields (top and bottom).
7661          * This causes a mismatched s->error_count
7662          * and a bad error table. Further, the error count goes to
7663          * INT_MAX when called for bottom field, because mb_y is
7664          * past end by one (callers fault) and resync_mb_y != 0
7665          * causes problems for the first MB line, too.
7666          */
7667         if (!FIELD_PICTURE)
7668             ff_er_frame_end(s);
7669
7670         MPV_frame_end(s);
7671
7672         if (cur->field_poc[0]==INT_MAX || cur->field_poc[1]==INT_MAX) {
7673             /* Wait for second field. */
7674             *data_size = 0;
7675
7676         } else {
7677             cur->repeat_pict = 0;
7678
7679             /* Signal interlacing information externally. */
7680             /* Prioritize picture timing SEI information over used decoding process if it exists. */
7681             if(h->sps.pic_struct_present_flag){
7682                 switch (h->sei_pic_struct)
7683                 {
7684                 case SEI_PIC_STRUCT_FRAME:
7685                     cur->interlaced_frame = 0;
7686                     break;
7687                 case SEI_PIC_STRUCT_TOP_FIELD:
7688                 case SEI_PIC_STRUCT_BOTTOM_FIELD:
7689                 case SEI_PIC_STRUCT_TOP_BOTTOM:
7690                 case SEI_PIC_STRUCT_BOTTOM_TOP:
7691                     cur->interlaced_frame = 1;
7692                     break;
7693                 case SEI_PIC_STRUCT_TOP_BOTTOM_TOP:
7694                 case SEI_PIC_STRUCT_BOTTOM_TOP_BOTTOM:
7695                     // Signal the possibility of telecined film externally (pic_struct 5,6)
7696                     // From these hints, let the applications decide if they apply deinterlacing.
7697                     cur->repeat_pict = 1;
7698                     cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7699                     break;
7700                 case SEI_PIC_STRUCT_FRAME_DOUBLING:
7701                     // Force progressive here, as doubling interlaced frame is a bad idea.
7702                     cur->interlaced_frame = 0;
7703                     cur->repeat_pict = 2;
7704                     break;
7705                 case SEI_PIC_STRUCT_FRAME_TRIPLING:
7706                     cur->interlaced_frame = 0;
7707                     cur->repeat_pict = 4;
7708                     break;
7709                 }
7710             }else{
7711                 /* Derive interlacing flag from used decoding process. */
7712                 cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7713             }
7714
7715             if (cur->field_poc[0] != cur->field_poc[1]){
7716                 /* Derive top_field_first from field pocs. */
7717                 cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7718             }else{
7719                 if(cur->interlaced_frame || h->sps.pic_struct_present_flag){
7720                     /* Use picture timing SEI information. Even if it is a information of a past frame, better than nothing. */
7721                     if(h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM
7722                       || h->sei_pic_struct == SEI_PIC_STRUCT_TOP_BOTTOM_TOP)
7723                         cur->top_field_first = 1;
7724                     else
7725                         cur->top_field_first = 0;
7726                 }else{
7727                     /* Most likely progressive */
7728                     cur->top_field_first = 0;
7729                 }
7730             }
7731
7732         //FIXME do something with unavailable reference frames
7733
7734             /* Sort B-frames into display order */
7735
7736             if(h->sps.bitstream_restriction_flag
7737                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7738                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7739                 s->low_delay = 0;
7740             }
7741
7742             if(   s->avctx->strict_std_compliance >= FF_COMPLIANCE_STRICT
7743                && !h->sps.bitstream_restriction_flag){
7744                 s->avctx->has_b_frames= MAX_DELAYED_PIC_COUNT;
7745                 s->low_delay= 0;
7746             }
7747
7748             pics = 0;
7749             while(h->delayed_pic[pics]) pics++;
7750
7751             assert(pics <= MAX_DELAYED_PIC_COUNT);
7752
7753             h->delayed_pic[pics++] = cur;
7754             if(cur->reference == 0)
7755                 cur->reference = DELAYED_PIC_REF;
7756
7757             out = h->delayed_pic[0];
7758             out_idx = 0;
7759             for(i=1; h->delayed_pic[i] && (h->delayed_pic[i]->poc && !h->delayed_pic[i]->key_frame); i++)
7760                 if(h->delayed_pic[i]->poc < out->poc){
7761                     out = h->delayed_pic[i];
7762                     out_idx = i;
7763                 }
7764             cross_idr = !h->delayed_pic[0]->poc || !!h->delayed_pic[i] || h->delayed_pic[0]->key_frame;
7765
7766             out_of_order = !cross_idr && out->poc < h->outputed_poc;
7767
7768             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7769                 { }
7770             else if((out_of_order && pics-1 == s->avctx->has_b_frames && s->avctx->has_b_frames < MAX_DELAYED_PIC_COUNT)
7771                || (s->low_delay &&
7772                 ((!cross_idr && out->poc > h->outputed_poc + 2)
7773                  || cur->pict_type == FF_B_TYPE)))
7774             {
7775                 s->low_delay = 0;
7776                 s->avctx->has_b_frames++;
7777             }
7778
7779             if(out_of_order || pics > s->avctx->has_b_frames){
7780                 out->reference &= ~DELAYED_PIC_REF;
7781                 for(i=out_idx; h->delayed_pic[i]; i++)
7782                     h->delayed_pic[i] = h->delayed_pic[i+1];
7783             }
7784             if(!out_of_order && pics > s->avctx->has_b_frames){
7785                 *data_size = sizeof(AVFrame);
7786
7787                 h->outputed_poc = out->poc;
7788                 *pict= *(AVFrame*)out;
7789             }else{
7790                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7791             }
7792         }
7793     }
7794
7795     assert(pict->data[0] || !*data_size);
7796     ff_print_debug_info(s, pict);
7797 //printf("out %d\n", (int)pict->data[0]);
7798 #if 0 //?
7799
7800     /* Return the Picture timestamp as the frame number */
7801     /* we subtract 1 because it is added on utils.c     */
7802     avctx->frame_number = s->picture_number - 1;
7803 #endif
7804     return get_consumed_bytes(s, buf_index, buf_size);
7805 }
7806 #if 0
7807 static inline void fill_mb_avail(H264Context *h){
7808     MpegEncContext * const s = &h->s;
7809     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7810
7811     if(s->mb_y){
7812         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7813         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7814         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7815     }else{
7816         h->mb_avail[0]=
7817         h->mb_avail[1]=
7818         h->mb_avail[2]= 0;
7819     }
7820     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7821     h->mb_avail[4]= 1; //FIXME move out
7822     h->mb_avail[5]= 0; //FIXME move out
7823 }
7824 #endif
7825
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test, built only when TEST is defined.
 *
 * Writes COUNT unsigned Exp-Golomb codes (values 0..COUNT-1) into a bit
 * buffer with set_ue_golomb() and reads them back with get_ue_golomb(), then
 * does the same for signed codes (values -COUNT/2..COUNT/2-1) with
 * set_se_golomb()/get_se_golomb(). Each call is wrapped in
 * START_TIMER/STOP_TIMER. A value that does not round-trip is reported on
 * stdout but does not abort the run.
 *
 * The 4x4 (I)DCT, quantizer and NAL layer tests are compiled out with #if 0.
 *
 * @return 0 always (the early "return -1" on mismatch is commented out)
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    /* avctx is handed to dsputil_init(); give it a defined, all-zero state
     * instead of passing uninitialized stack contents. */
    memset(&avctx, 0, sizeof(avctx));
    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* Peek at the upcoming bits so a mismatch can be reported together
         * with the raw code that produced it. */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* Shift the range so that negative and positive values are covered. */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* Report the value that was actually written (i - COUNT/2), not
             * the loop index, as the expected one. */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    /* Disabled tests. As written this region refers to names that are not
     * declared in this function (qp, src1_block, src2_block, and "s" used as
     * a pointer), so it cannot simply be re-enabled. */
#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
8000
8001
8002 static av_cold int decode_end(AVCodecContext *avctx)
8003 {
8004     H264Context *h = avctx->priv_data;
8005     MpegEncContext *s = &h->s;
8006     int i;
8007
8008     av_freep(&h->rbsp_buffer[0]);
8009     av_freep(&h->rbsp_buffer[1]);
8010     free_tables(h); //FIXME cleanup init stuff perhaps
8011
8012     for(i = 0; i < MAX_SPS_COUNT; i++)
8013         av_freep(h->sps_buffers + i);
8014
8015     for(i = 0; i < MAX_PPS_COUNT; i++)
8016         av_freep(h->pps_buffers + i);
8017
8018     MPV_common_end(s);
8019
8020 //    memset(h, 0, sizeof(H264Context));
8021
8022     return 0;
8023 }
8024
8025
8026 AVCodec h264_decoder = {
8027     "h264",
8028     CODEC_TYPE_VIDEO,
8029     CODEC_ID_H264,
8030     sizeof(H264Context),
8031     decode_init,
8032     NULL,
8033     decode_end,
8034     decode_frame,
8035     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_DELAY,
8036     .flush= flush_dpb,
8037     .long_name = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10"),
8038 };
8039
#if CONFIG_H264_VDPAU_DECODER
/* Codec table entry for the "h264_vdpau" decoder; it uses the same callbacks
 * as h264_decoder and additionally sets CODEC_CAP_HWACCEL_VDPAU. */
AVCodec h264_vdpau_decoder = {
    .name           = "h264_vdpau",
    .type           = CODEC_TYPE_VIDEO,
    .id             = CODEC_ID_H264,
    .priv_data_size = sizeof(H264Context),
    .init           = decode_init,
    .encode         = NULL,
    .close          = decode_end,
    .decode         = decode_frame,
    .capabilities   = CODEC_CAP_DR1 | CODEC_CAP_DELAY | CODEC_CAP_HWACCEL_VDPAU,
    .flush          = flush_dpb,
    .long_name      = NULL_IF_CONFIG_SMALL("H.264 / AVC / MPEG-4 AVC / MPEG-4 part 10 (VDPAU acceleration)"),
};
#endif
8055
8056 #if CONFIG_SVQ3_DECODER
8057 #include "svq3.c"
8058 #endif