libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file h264.c
  24  * H.264 / AVC / MPEG4 part10 codec.
  25  * @author Michael Niedermayer <michaelni@gmx.at>
  26  */
  27
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264.h"
  32 #include "h264data.h"
  33 #include "h264_parser.h"
  34 #include "golomb.h"
  35 #include "rectangle.h"
  36
  37 #include "cabac.h"
  38
  39 //#undef NDEBUG
  40 #include <assert.h>
  41
  42 /**
  43  * Value of Picture.reference when Picture is not a reference picture, but
  44  * is held for delayed output.
  45  */
  46 #define DELAYED_PIC_REF 4
  47
  /*
   * File-scope VLC tables. Nothing in the visible part of this file fills or
   * reads them.
   * NOTE(review): judging only by the names, these are the coeff_token,
   * total_zeros and run tables of the coefficient VLC decoder (with separate
   * chroma DC variants) -- confirm against the code that initialises them.
   */
  48 static VLC coeff_token_vlc[4];
  49 static VLC chroma_dc_coeff_token_vlc;
  50
  51 static VLC total_zeros_vlc[15];
  52 static VLC chroma_dc_total_zeros_vlc[3];
  53
  54 static VLC run_vlc[6];
  55 static VLC run7_vlc;
  56
  /*
   * Forward declarations of file-local functions; their definitions are not
   * in the visible part of this file. filter_mb() and filter_mb_fast() take
   * identical parameter lists.
   */
  57 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
  58 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
  59 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  60 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
  61
  /**
   * Packs the low 16 bits of two integers into one 32-bit word.
   *
   * On WORDS_BIGENDIAN builds a is placed in the upper half and b in the
   * lower half; otherwise a is placed in the lower half and b in the upper
   * half. Either way a is the half that comes first in memory order when the
   * word is stored.
   *
   * The upper-half operand is shifted as uint32_t: callers in this file pass
   * int8_t reference indices, which may be negative, and left-shifting a
   * negative signed int is undefined behaviour.
   *
   * @param a value whose low 16 bits go first in memory order
   * @param b value whose low 16 bits go second in memory order
   * @return the packed 32-bit word
   */
  static av_always_inline uint32_t pack16to32(int a, int b){
  #ifdef WORDS_BIGENDIAN
     return (b&0xFFFF) + ((uint32_t)a<<16);
  #else
     return (a&0xFFFF) + ((uint32_t)b<<16);
  #endif
  }
  69
  /** Remainder lookup table: ff_rem6[i] == i % 6 for i = 0..51. */
  const uint8_t ff_rem6[52]={
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3, 4, 5,
      0, 1, 2, 3,
  };
  73
  /** Quotient lookup table: ff_div6[i] == i / 6 for i = 0..51. */
  const uint8_t ff_div6[52]={
      0, 0, 0, 0, 0, 0,
      1, 1, 1, 1, 1, 1,
      2, 2, 2, 2, 2, 2,
      3, 3, 3, 3, 3, 3,
      4, 4, 4, 4, 4, 4,
      5, 5, 5, 5, 5, 5,
      6, 6, 6, 6, 6, 6,
      7, 7, 7, 7, 7, 7,
      8, 8, 8, 8,
  };
  77
  78
  /**
   * Loads the per-macroblock neighbour caches of h for the macroblock at
   * (s->mb_x, s->mb_y).
   *
   * The addresses of the top, top-left, top-right and the two left neighbours
   * are computed (with extra adjustments under FRAME_MBAFF), h->top_mb_xy and
   * h->left_mb_xy[] are stored, and then, depending on mb_type, the function
   * fills: the intra sample availability masks and the neighbour entries of
   * intra4x4_pred_mode_cache (intra MBs), the neighbour entries of
   * non_zero_count_cache, top_cbp / left_cbp (CABAC only), and for inter or
   * direct MBs the neighbour entries of mv_cache and ref_cache plus, with
   * CABAC, mvd_cache and direct_cache. It finally sets
   * h->neighbor_transform_size.
   *
   * @param h           decoder context; read and written as described above
   * @param mb_type     type flags of the current macroblock
   * @param for_deblock non-zero when called for deblocking: neighbours are
   *                    then accepted when their slice_table entry is < 255
   *                    instead of equal to h->slice_num, the top-left and
   *                    top-right neighbours are treated as absent, and the
   *                    function may return early without writing anything
   */
  79 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
  80     MpegEncContext * const s = &h->s;
  81     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
  82     int topleft_xy, top_xy, topright_xy, left_xy[2];
  83     int topleft_type, top_type, topright_type, left_type[2];
  84     int left_block[8];
  85     int topleft_partition= -1;
  86     int i;
  87
         /* MB directly above; the row step is mb_stride << FIELD_PICTURE. */
  88     top_xy     = mb_xy  - (s->mb_stride << FIELD_PICTURE);
  89
  90     //FIXME deblocking could skip the intra and nnz parts.
         /* Early exit for deblocking outside MBAFF: taken when h->slice_num is 1
          * or the top MB has the same slice_table value as the current one.
          * Nothing (not even h->top_mb_xy / h->left_mb_xy) is written on this
          * path. */
  91     if(for_deblock && (h->slice_num == 1 || h->slice_table[mb_xy] == h->slice_table[top_xy]) && !FRAME_MBAFF)
  92         return;
  93
  94     /* Wow, what a mess, why didn't they simplify the interlacing & intra
  95      * stuff, I can't imagine that these complex rules are worth it. */
  96
         /* Default neighbour addresses; the FRAME_MBAFF block below may move them. */
  97     topleft_xy = top_xy - 1;
  98     topright_xy= top_xy + 1;
  99     left_xy[1] = left_xy[0] = mb_xy-1;
         /* left_block[] selects which entries of the left neighbour are read:
          * [0..3] index its intra4x4_pred_mode / non_zero_count entries and its
          * motion vector rows, [4..7] index further non_zero_count entries
          * (NOTE(review): presumably the chroma ones - confirm). */
 100     left_block[0]= 0;
 101     left_block[1]= 1;
 102     left_block[2]= 2;
 103     left_block[3]= 3;
 104     left_block[4]= 7;
 105     left_block[5]= 10;
 106     left_block[6]= 8;
 107     left_block[7]= 11;
         /* MBAFF: neighbours are located relative to the macroblock pair
          * (pair_xy uses mb_y & ~1) and corrected according to whether the
          * current MB and each neighbouring pair are IS_INTERLACED. */
 108     if(FRAME_MBAFF){
 109         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 110         const int top_pair_xy      = pair_xy     - s->mb_stride;
 111         const int topleft_pair_xy  = top_pair_xy - 1;
 112         const int topright_pair_xy = top_pair_xy + 1;
 113         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 114         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 115         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 116         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 117         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 118         const int bottom = (s->mb_y & 1);
 119         tprintf(s->avctx, "fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 120         if (bottom
 121                 ? !curr_mb_frame_flag // bottom macroblock
 122                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 123                 ) {
 124             top_xy -= s->mb_stride;
 125         }
 126         if (bottom
 127                 ? !curr_mb_frame_flag // bottom macroblock
 128                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 129                 ) {
 130             topleft_xy -= s->mb_stride;
 131         } else if(bottom && curr_mb_frame_flag && !left_mb_frame_flag) {
 132             topleft_xy += s->mb_stride;
 133             // take topleft mv from the middle of the mb, as opposed to all other modes which use the bottom-right partition
 134             topleft_partition = 0;
 135         }
 136         if (bottom
 137                 ? !curr_mb_frame_flag // bottom macroblock
 138                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 139                 ) {
 140             topright_xy -= s->mb_stride;
 141         }
             /* The left pair differs from the current MB in IS_INTERLACED: start
              * from the top MB of the left pair and remap which of its rows are
              * read through left_block[]. */
 142         if (left_mb_frame_flag != curr_mb_frame_flag) {
 143             left_xy[1] = left_xy[0] = pair_xy - 1;
 144             if (curr_mb_frame_flag) {
 145                 if (bottom) {
 146                     left_block[0]= 2;
 147                     left_block[1]= 2;
 148                     left_block[2]= 3;
 149                     left_block[3]= 3;
 150                     left_block[4]= 8;
 151                     left_block[5]= 11;
 152                     left_block[6]= 8;
 153                     left_block[7]= 11;
 154                 } else {
 155                     left_block[0]= 0;
 156                     left_block[1]= 0;
 157                     left_block[2]= 1;
 158                     left_block[3]= 1;
 159                     left_block[4]= 7;
 160                     left_block[5]= 10;
 161                     left_block[6]= 7;
 162                     left_block[7]= 10;
 163                 }
 164             } else {
                     /* Second left neighbour is the bottom MB of the left pair. */
 165                 left_xy[1] += s->mb_stride;
 166                 //left_block[0]= 0;
 167                 left_block[1]= 2;
 168                 left_block[2]= 0;
 169                 left_block[3]= 2;
 170                 //left_block[4]= 7;
 171                 left_block[5]= 10;
 172                 left_block[6]= 7;
 173                 left_block[7]= 10;
 174             }
 175         }
 176     }
 177
         /* Publish the neighbour addresses chosen above. */
 178     h->top_mb_xy = top_xy;
 179     h->left_mb_xy[0] = left_xy[0];
 180     h->left_mb_xy[1] = left_xy[1];
         /* Neighbour types; 0 marks a neighbour that must not be used. */
 181     if(for_deblock){
 182         topleft_type = 0;
 183         topright_type = 0;
 184         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 185         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 186         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 187
             /* MBAFF deblocking of a non-intra MB: rebuild the current MB's own
              * cache entries - luma non-zero flags from the 16-bit mask stored
              * at non_zero_count[mb_xy][14], and mv_cache / ref_cache from the
              * picture's motion_val / ref_index arrays. */
 188         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 189             int list;
 190             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 191             for(i=0; i<16; i++)
 192                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 193             for(list=0; list<h->list_count; list++){
 194                 if(USES_LIST(mb_type,list)){
 195                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 196                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 197                     int8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 198                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 199                         dst[0] = src[0];
 200                         dst[1] = src[1];
 201                         dst[2] = src[2];
 202                         dst[3] = src[3];
 203                     }
 204                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 205                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 206                     ref += h->b8_stride;
 207                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 208                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 209                 }else{
 210                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 211                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 212                 }
 213             }
 214         }
 215     }else{
 216         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 217         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 218         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 219         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 220         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 221     }
 222
         /* Intra MB: start from the all-available masks, then restrict them for
          * each neighbour that is absent (type 0) or, with
          * constrained_intra_pred, not intra. */
 223     if(IS_INTRA(mb_type)){
 224         h->topleft_samples_available=
 225         h->top_samples_available=
 226         h->left_samples_available= 0xFFFF;
 227         h->topright_samples_available= 0xEEEA;
 228
 229         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 230             h->topleft_samples_available= 0xB3FF;
 231             h->top_samples_available= 0x33FF;
 232             h->topright_samples_available= 0x26EA;
 233         }
 234         for(i=0; i<2; i++){
 235             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 236                 h->topleft_samples_available&= 0xDF5F;
 237                 h->left_samples_available&= 0x5F5F;
 238             }
 239         }
 240
 241         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 242             h->topleft_samples_available&= 0x7FFF;
 243
 244         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 245             h->topright_samples_available&= 0xFBFF;
 246
             /* Neighbouring intra4x4 modes: copied from an intra4x4 neighbour;
              * otherwise -1 (neighbour absent, or inter with
              * constrained_intra_pred) or 2 (NOTE(review): presumably the DC
              * mode - confirm against the mode constants). */
 247         if(IS_INTRA4x4(mb_type)){
 248             if(IS_INTRA4x4(top_type)){
 249                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 250                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 251                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 252                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 253             }else{
 254                 int pred;
 255                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 256                     pred= -1;
 257                 else{
 258                     pred= 2;
 259                 }
 260                 h->intra4x4_pred_mode_cache[4+8*0]=
 261                 h->intra4x4_pred_mode_cache[5+8*0]=
 262                 h->intra4x4_pred_mode_cache[6+8*0]=
 263                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 264             }
 265             for(i=0; i<2; i++){
 266                 if(IS_INTRA4x4(left_type[i])){
 267                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 268                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 269                 }else{
 270                     int pred;
 271                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 272                         pred= -1;
 273                     else{
 274                         pred= 2;
 275                     }
 276                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 277                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 278                 }
 279             }
 280         }
 281     }
 282
 283
 284 /*
 285 0 . T T. T T T T
 286 1 L . .L . . . .
 287 2 L . .L . . . .
 288 3 . T TL . . . .
 289 4 L . .L . . . .
 290 5 L . .. . . . .
 291 */
 292 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
         /* Neighbouring non-zero counts; when a neighbour is absent the entries
          * get 0 for a non-intra MB in CABAC mode and 64 otherwise. */
 293     if(top_type){
 294         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 295         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 296         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 297         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 298
 299         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 300         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 301
 302         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 303         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 304
 305     }else{
 306         h->non_zero_count_cache[4+8*0]=
 307         h->non_zero_count_cache[5+8*0]=
 308         h->non_zero_count_cache[6+8*0]=
 309         h->non_zero_count_cache[7+8*0]=
 310
 311         h->non_zero_count_cache[1+8*0]=
 312         h->non_zero_count_cache[2+8*0]=
 313
 314         h->non_zero_count_cache[1+8*3]=
 315         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 316
 317     }
 318
 319     for (i=0; i<2; i++) {
 320         if(left_type[i]){
 321             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 322             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 323             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 324             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 325         }else{
 326             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 327             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 328             h->non_zero_count_cache[0+8*1 +   8*i]=
 329             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 330         }
 331     }
 332
         /* CABAC only: cbp values of the top and left neighbours. An absent
          * neighbour yields 0x1C0 when the current MB is intra and 0 otherwise. */
 333     if( h->pps.cabac ) {
 334         // top_cbp
 335         if(top_type) {
 336             h->top_cbp = h->cbp_table[top_xy];
 337         } else if(IS_INTRA(mb_type)) {
 338             h->top_cbp = 0x1C0;
 339         } else {
 340             h->top_cbp = 0;
 341         }
 342         // left_cbp
 343         if (left_type[0]) {
 344             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 345         } else if(IS_INTRA(mb_type)) {
 346             h->left_cbp = 0x1C0;
 347         } else {
 348             h->left_cbp = 0;
 349         }
             /* Bits 1 and 3 of left_cbp are taken from one cbp bit of each left
              * neighbour, selected through left_block[0] / left_block[2]. */
 350         if (left_type[0]) {
 351             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 352         }
 353         if (left_type[1]) {
 354             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 355         }
 356     }
 357
 358 #if 1
         /* Inter or direct MB: load neighbouring motion data per reference list. */
 359     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 360         int list;
 361         for(list=0; list<h->list_count; list++){
 362             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 363                 /*if(!h->mv_cache_clean[list]){
 364                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 365                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 366                     h->mv_cache_clean[list]= 1;
 367                 }*/
 368                 continue;
 369             }
 370             h->mv_cache_clean[list]= 0;
 371
                 /* Top neighbour: four motion vectors from its row 3 and two
                  * reference indices from its 8x8 row 1; otherwise zero vectors
                  * and LIST_NOT_USED / PART_NOT_AVAILABLE. */
 372             if(USES_LIST(top_type, list)){
 373                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 374                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 375                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 376                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 377                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 378                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 379                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 380                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 381                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 382                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 383             }else{
 384                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 385                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 386                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 387                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 388                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 389             }
 390
                 /* Left neighbours: two cache rows from each of left_xy[0] and
                  * left_xy[1], read from column 3 at the rows given by
                  * left_block[]. */
 391             for(i=0; i<2; i++){
 392                 int cache_idx = scan8[0] - 1 + i*2*8;
 393                 if(USES_LIST(left_type[i], list)){
 394                     const int b_xy= h->mb2b_xy[left_xy[i]] + 3;
 395                     const int b8_xy= h->mb2b8_xy[left_xy[i]] + 1;
 396                     *(uint32_t*)h->mv_cache[list][cache_idx  ]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0+i*2]];
 397                     *(uint32_t*)h->mv_cache[list][cache_idx+8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1+i*2]];
 398                     h->ref_cache[list][cache_idx  ]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0+i*2]>>1)];
 399                     h->ref_cache[list][cache_idx+8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1+i*2]>>1)];
 400                 }else{
 401                     *(uint32_t*)h->mv_cache [list][cache_idx  ]=
 402                     *(uint32_t*)h->mv_cache [list][cache_idx+8]= 0;
 403                     h->ref_cache[list][cache_idx  ]=
 404                     h->ref_cache[list][cache_idx+8]= left_type[i] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 405                 }
 406             }
 407
                 /* Outside MBAFF the remaining entries are skipped for deblocking
                  * and for direct MBs that do not use spatial direct prediction. */
 408             if((for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred)) && !FRAME_MBAFF)
 409                 continue;
 410
                 /* Top-left neighbour (one vector / reference index). */
 411             if(USES_LIST(topleft_type, list)){
 412                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + h->b_stride + (topleft_partition & 2*h->b_stride);
 413                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + (topleft_partition & h->b8_stride);
 414                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 415                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 416             }else{
 417                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 418                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 419             }
 420
                 /* Top-right neighbour (one vector / reference index). */
 421             if(USES_LIST(topright_type, list)){
 422                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 423                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 424                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 425                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 426             }else{
 427                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 428                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 429             }
 430
 431             if((IS_SKIP(mb_type) || IS_DIRECT(mb_type)) && !FRAME_MBAFF)
 432                 continue;
 433
                 /* Fixed cache positions that are always marked
                  * PART_NOT_AVAILABLE with a zero vector. */
 434             h->ref_cache[list][scan8[5 ]+1] =
 435             h->ref_cache[list][scan8[7 ]+1] =
 436             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 437             h->ref_cache[list][scan8[4 ]] =
 438             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 439             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 440             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 441             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 442             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 443             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 444
                 /* CABAC only: the same top / left loading for mvd_cache, read
                  * from h->mvd_table. */
 445             if( h->pps.cabac ) {
 446                 /* XXX beurk, Load mvd */
 447                 if(USES_LIST(top_type, list)){
 448                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 449                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 450                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 451                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 452                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 453                 }else{
 454                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 455                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 456                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 457                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 458                 }
 459                 if(USES_LIST(left_type[0], list)){
 460                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 461                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 462                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 463                 }else{
 464                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 465                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 466                 }
 467                 if(USES_LIST(left_type[1], list)){
 468                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 469                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 470                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 471                 }else{
 472                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 473                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 474                 }
 475                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 476                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 477                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 478                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 479                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 480
                     /* B slices: clear the current MB's direct_cache entries and
                      * load the direct flags of the top and left neighbours. */
 481                 if(h->slice_type == FF_B_TYPE){
 482                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 483
 484                     if(IS_DIRECT(top_type)){
 485                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 486                     }else if(IS_8X8(top_type)){
 487                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 488                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 489                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 490                     }else{
 491                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 492                     }
 493
 494                     if(IS_DIRECT(left_type[0]))
 495                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 496                     else if(IS_8X8(left_type[0]))
 497                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 498                     else
 499                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 500
 501                     if(IS_DIRECT(left_type[1]))
 502                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 503                     else if(IS_8X8(left_type[1]))
 504                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 505                     else
 506                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 507                 }
 508             }
 509
                 /* MBAFF: rescale neighbours coded in the other mode. For a field
                  * MB (MB_FIELD), non-interlaced neighbours with a valid
                  * reference get the reference index doubled and component [1]
                  * of mv / mvd halved; for a frame MB, interlaced neighbours get
                  * the inverse. MAP_MVS lists the ten neighbour cache positions
                  * this is applied to. */
 510             if(FRAME_MBAFF){
 511 #define MAP_MVS\
 512                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 513                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 514                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 515                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 516                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 517                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 518                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 519                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 520                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 521                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 522                 if(MB_FIELD){
 523 #define MAP_F2F(idx, mb_type)\
 524                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 525                         h->ref_cache[list][idx] <<= 1;\
 526                         h->mv_cache[list][idx][1] /= 2;\
 527                         h->mvd_cache[list][idx][1] /= 2;\
 528                     }
 529                     MAP_MVS
 530 #undef MAP_F2F
 531                 }else{
 532 #define MAP_F2F(idx, mb_type)\
 533                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 534                         h->ref_cache[list][idx] >>= 1;\
 535                         h->mv_cache[list][idx][1] <<= 1;\
 536                         h->mvd_cache[list][idx][1] <<= 1;\
 537                     }
 538                     MAP_MVS
 539 #undef MAP_F2F
 540                 }
 541             }
 542         }
 543     }
 544 #endif
 545
         /* Number (0..2) of the top and first left neighbours flagged IS_8x8DCT. */
 546     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 547 }
 548
 549 static inline void write_back_intra_pred_mode(H264Context *h){
 550     MpegEncContext * const s = &h->s;
 551     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 552
 553     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 554     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 555     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 556     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 557     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 558     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 559     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 560 }
 561
 562 /**
 563  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 564  */
 565 static inline int check_intra4x4_pred_mode(H264Context *h){
 566     MpegEncContext * const s = &h->s;
 567     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 568     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 569     int i;
 570
 571     if(!(h->top_samples_available&0x8000)){
 572         for(i=0; i<4; i++){
 573             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 574             if(status<0){
 575                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 576                 return -1;
 577             } else if(status){
 578                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 579             }
 580         }
 581     }
 582
 583     if(!(h->left_samples_available&0x8000)){
 584         for(i=0; i<4; i++){
 585             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 586             if(status<0){
 587                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 588                 return -1;
 589             } else if(status){
 590                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 591             }
 592         }
 593     }
 594
 595     return 0;
 596 } //FIXME cleanup like next
 597
 598 /**
 599  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 600  */
 601 static inline int check_intra_pred_mode(H264Context *h, int mode){
 602     MpegEncContext * const s = &h->s;
 603     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 604     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 605
 606     if(mode > 6U) {
 607         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 608         return -1;
 609     }
 610
 611     if(!(h->top_samples_available&0x8000)){
 612         mode= top[ mode ];
 613         if(mode<0){
 614             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 615             return -1;
 616         }
 617     }
 618
 619     if(!(h->left_samples_available&0x8000)){
 620         mode= left[ mode ];
 621         if(mode<0){
 622             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 623             return -1;
 624         }
 625     }
 626
 627     return mode;
 628 }
 629
 630 /**
 631  * gets the predicted intra4x4 prediction mode.
 632  */
 633 static inline int pred_intra_mode(H264Context *h, int n){
 634     const int index8= scan8[n];
 635     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 636     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 637     const int min= FFMIN(left, top);
 638
 639     tprintf(h->s.avctx, "mode:%d %d min:%d\n", left ,top, min);
 640
 641     if(min<0) return DC_PRED;
 642     else      return min;
 643 }
 644
 645 static inline void write_back_non_zero_count(H264Context *h){
 646     MpegEncContext * const s = &h->s;
 647     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 648
 649     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 650     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 651     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 652     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 653     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 654     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 655     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 656
 657     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 658     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 659     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 660
 661     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 662     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 663     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 664
 665     if(FRAME_MBAFF){
 666         // store all luma nnzs, for deblocking
 667         int v = 0, i;
 668         for(i=0; i<16; i++)
 669             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
 670         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
 671     }
 672 }
 673
 674 /**
 675  * gets the predicted number of non zero coefficients.
 676  * @param n block index
 677  */
 678 static inline int pred_non_zero_count(H264Context *h, int n){
 679     const int index8= scan8[n];
 680     const int left= h->non_zero_count_cache[index8 - 1];
 681     const int top = h->non_zero_count_cache[index8 - 8];
 682     int i= left + top;
 683
 684     if(i<64) i= (i+1)>>1;
 685
 686     tprintf(h->s.avctx, "pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 687
 688     return i&31;
 689 }
 690
 691 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
         /*
          * Selects the diagonal neighbour used for motion vector prediction of the
          * partition whose first cache position is i and whose width is part_width
          * cache entries. *C is set to the chosen motion vector and the matching
          * reference value is returned.
          * The top-right entry (i - 8 + part_width) is used unless its ref_cache
          * value is PART_NOT_AVAILABLE; in that case the top-left entry (i - 8 - 1)
          * is used instead.
          * In MBAFF frames some cases are served directly from the current picture's
          * motion_val / ref_index through SET_DIAG_MV, which returns from this
          * function on its own.
          */
 692     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 693     MpegEncContext *s = &h->s;
 694
 695     /* there is no consistent mapping of mvs to neighboring locations that will
 696      * make mbaff happy, so we can't move all this logic to fill_caches */
 697     if(FRAME_MBAFF){
 698         const uint32_t *mb_types = s->current_picture_ptr->mb_type;
 699         const int16_t *mv;
             /* cache slot scan8[0]-2 serves as scratch storage for the MV built by
              * SET_DIAG_MV; it is zeroed first and *C points at it */
 700         *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
 701         *C = h->mv_cache[list][scan8[0]-2];
 702
 703         if(!MB_FIELD
 704            && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
 705             int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
 706             if(IS_INTERLACED(mb_types[topright_xy])){
                     /* SET_DIAG_MV declares locals and contains return statements, so
                      * each use has to sit in its own braces. MV_OP is applied to the
                      * vertical MV component, REF_OP to the reference index read from
                      * the picture. It returns LIST_NOT_USED when the macroblock at
                      * (X4, Y4) does not use this list. */
 707 #define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
 708                 const int x4 = X4, y4 = Y4;\
 709                 const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
 710                 if(!USES_LIST(mb_type,list))\
 711                     return LIST_NOT_USED;\
 712                 mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
 713                 h->mv_cache[list][scan8[0]-2][0] = mv[0];\
 714                 h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
 715                 return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;
 716
 717                 SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
 718             }
 719         }
             /* top-right missing: for partitions in cache column 4 whose left
              * neighbour exists, take the MV from the left macroblock when its
              * frame/field type differs from the current one */
 720         if(topright_ref == PART_NOT_AVAILABLE
 721            && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
 722            && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
 723             if(!MB_FIELD
 724                && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
 725                 SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
 726             }
 727             if(MB_FIELD
 728                && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
 729                && i >= scan8[0]+8){
 730                 // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
 731                 SET_DIAG_MV(/2, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
 732             }
 733         }
 734 #undef SET_DIAG_MV
 735     }
 736
         /* regular case: top-right neighbour if available, top-left otherwise */
 737     if(topright_ref != PART_NOT_AVAILABLE){
 738         *C= h->mv_cache[list][ i - 8 + part_width ];
 739         return topright_ref;
 740     }else{
 741         tprintf(s->avctx, "topright MV not available\n");
 742
 743         *C= h->mv_cache[list][ i - 8 - 1 ];
 744         return h->ref_cache[list][ i - 8 - 1 ];
 745     }
 746 }
 747
 748 /**
 749  * gets the predicted MV.
 750  * @param n the block index
 751  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 752  * @param mx the x component of the predicted motion vector
 753  * @param my the y component of the predicted motion vector
 754  */
 755 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 756     const int index8= scan8[n];
 757     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 758     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 759     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 760     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 761     const int16_t * C;
 762     int diagonal_ref, match_count;
 763
 764     assert(part_width==1 || part_width==2 || part_width==4);
 765
 766 /* mv_cache
 767   B . . A T T T T
 768   U . . L . . , .
 769   U . . L . . . .
 770   U . . L . . , .
 771   . . . L . . . .
 772 */
 773
 774     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 775     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 776     tprintf(h->s.avctx, "pred_motion match_count=%d\n", match_count);
 777     if(match_count > 1){ //most common
 778         *mx= mid_pred(A[0], B[0], C[0]);
 779         *my= mid_pred(A[1], B[1], C[1]);
 780     }else if(match_count==1){
 781         if(left_ref==ref){
 782             *mx= A[0];
 783             *my= A[1];
 784         }else if(top_ref==ref){
 785             *mx= B[0];
 786             *my= B[1];
 787         }else{
 788             *mx= C[0];
 789             *my= C[1];
 790         }
 791     }else{
 792         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
 793             *mx= A[0];
 794             *my= A[1];
 795         }else{
 796             *mx= mid_pred(A[0], B[0], C[0]);
 797             *my= mid_pred(A[1], B[1], C[1]);
 798         }
 799     }
 800
 801     tprintf(h->s.avctx, "pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
 802 }
 803
 804 /**
 805  * gets the directionally predicted 16x8 MV.
 806  * @param n the block index
 807  * @param mx the x component of the predicted motion vector
 808  * @param my the y component of the predicted motion vector
 809  */
 810 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 811     if(n==0){
 812         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
 813         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
 814
 815         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
 816
 817         if(top_ref == ref){
 818             *mx= B[0];
 819             *my= B[1];
 820             return;
 821         }
 822     }else{
 823         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
 824         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
 825
 826         tprintf(h->s.avctx, "pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 827
 828         if(left_ref == ref){
 829             *mx= A[0];
 830             *my= A[1];
 831             return;
 832         }
 833     }
 834
 835     //RARE
 836     pred_motion(h, n, 4, list, ref, mx, my);
 837 }
 838
 839 /**
 840  * gets the directionally predicted 8x16 MV.
 841  * @param n the block index
 842  * @param mx the x component of the predicted motion vector
 843  * @param my the y component of the predicted motion vector
 844  */
 845 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
 846     if(n==0){
 847         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
 848         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
 849
 850         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
 851
 852         if(left_ref == ref){
 853             *mx= A[0];
 854             *my= A[1];
 855             return;
 856         }
 857     }else{
 858         const int16_t * C;
 859         int diagonal_ref;
 860
 861         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
 862
 863         tprintf(h->s.avctx, "pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
 864
 865         if(diagonal_ref == ref){
 866             *mx= C[0];
 867             *my= C[1];
 868             return;
 869         }
 870     }
 871
 872     //RARE
 873     pred_motion(h, n, 2, list, ref, mx, my);
 874 }
 875
 876 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
 877     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
 878     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
 879
 880     tprintf(h->s.avctx, "pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
 881
 882     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
 883        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
 884        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
 885
 886         *mx = *my = 0;
 887         return;
 888     }
 889
 890     pred_motion(h, 0, 4, 0, 0, mx, my);
 891
 892     return;
 893 }
 894
 895 static inline void direct_dist_scale_factor(H264Context * const h){
 896     const int poc = h->s.current_picture_ptr->poc;
 897     const int poc1 = h->ref_list[1][0].poc;
 898     int i;
 899     for(i=0; i<h->ref_count[0]; i++){
 900         int poc0 = h->ref_list[0][i].poc;
 901         int td = av_clip(poc1 - poc0, -128, 127);
 902         if(td == 0 /* FIXME || pic0 is a long-term ref */){
 903             h->dist_scale_factor[i] = 256;
 904         }else{
 905             int tb = av_clip(poc - poc0, -128, 127);
 906             int tx = (16384 + (FFABS(td) >> 1)) / td;
 907             h->dist_scale_factor[i] = av_clip((tb*tx + 32) >> 6, -1024, 1023);
 908         }
 909     }
 910     if(FRAME_MBAFF){
 911         for(i=0; i<h->ref_count[0]; i++){
 912             h->dist_scale_factor_field[2*i] =
 913             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
 914         }
 915     }
 916 }
 917 static inline void direct_ref_list_init(H264Context * const h){
 918     MpegEncContext * const s = &h->s;
 919     Picture * const ref1 = &h->ref_list[1][0];
 920     Picture * const cur = s->current_picture_ptr;
 921     int list, i, j;
 922     if(cur->pict_type == FF_I_TYPE)
 923         cur->ref_count[0] = 0;
 924     if(cur->pict_type != FF_B_TYPE)
 925         cur->ref_count[1] = 0;
 926     for(list=0; list<2; list++){
 927         cur->ref_count[list] = h->ref_count[list];
 928         for(j=0; j<h->ref_count[list]; j++)
 929             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
 930     }
 931     if(cur->pict_type != FF_B_TYPE || h->direct_spatial_mv_pred)
 932         return;
 933     for(list=0; list<2; list++){
 934         for(i=0; i<ref1->ref_count[list]; i++){
 935             const int poc = ref1->ref_poc[list][i];
 936             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
 937             for(j=0; j<h->ref_count[list]; j++)
 938                 if(h->ref_list[list][j].poc == poc){
 939                     h->map_col_to_list0[list][i] = j;
 940                     break;
 941                 }
 942         }
 943     }
 944     if(FRAME_MBAFF){
 945         for(list=0; list<2; list++){
 946             for(i=0; i<ref1->ref_count[list]; i++){
 947                 j = h->map_col_to_list0[list][i];
 948                 h->map_col_to_list0_field[list][2*i] = 2*j;
 949                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
 950             }
 951         }
 952     }
 953 }
 954
 955 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
         /*
          * Derives the motion vectors and reference indices of a direct predicted
          * macroblock, or of its direct 8x8 partitions when *mb_type is an 8x8 type,
          * and stores them in h->mv_cache / h->ref_cache.
          * Colocated data is read from h->ref_list[1][0]. Spatial prediction is used
          * when h->direct_spatial_mv_pred is set, temporal prediction otherwise.
          * *mb_type is read and then rewritten with the partitioning and list flags
          * chosen here; h->sub_mb_type[] is set for every partition that is handled.
          */
 956     MpegEncContext * const s = &h->s;
 957     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
 958     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
 959     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
         /* type, motion vectors and reference indices (both lists) stored in the
          * first list 1 reference at the position of the current macroblock */
 960     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
 961     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
 962     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
 963     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
 964     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
 965     const int is_b8x8 = IS_8X8(*mb_type);
 966     unsigned int sub_mb_type;
 967     int i8, i4;
 968
         /* colocated types for which a single 16x16 direct partition is tried */
 969 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
 970     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
 971         /* FIXME save sub mb types from previous frames (or derive from MVs)
 972          * so we know exactly what block size to use */
 973         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
 974         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 975     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
 976         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 977         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
 978     }else{
 979         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
 980         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
 981     }
         /* the macroblock as a whole is direct unless it came in as an 8x8 type */
 982     if(!is_b8x8)
 983         *mb_type |= MB_TYPE_DIRECT2;
 984     if(MB_FIELD)
 985         *mb_type |= MB_TYPE_INTERLACED;
 986
 987     tprintf(s->avctx, "mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
 988
 989     if(h->direct_spatial_mv_pred){
 990         int ref[2];
 991         int mv[2][2];
 992         int list;
 993
 994         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
 995
 996         /* ref = min(neighbors) */
 997         for(list=0; list<2; list++){
 998             int refa = h->ref_cache[list][scan8[0] - 1];
 999             int refb = h->ref_cache[list][scan8[0] - 8];
1000             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
                 /* -2 marks a missing top-right entry here (NOTE(review): presumably
                  * PART_NOT_AVAILABLE - confirm); use the top-left neighbour instead */
1001             if(refc == -2)
1002                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
                 /* smallest non negative value of refa/refb/refc, -1 if there is none */
1003             ref[list] = refa;
1004             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1005                 ref[list] = refb;
1006             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1007                 ref[list] = refc;
1008             if(ref[list] < 0)
1009                 ref[list] = -1;
1010         }
1011
             /* no usable neighbour reference in either list: ref 0 and zero MVs for both */
1012         if(ref[0] < 0 && ref[1] < 0){
1013             ref[0] = ref[1] = 0;
1014             mv[0][0] = mv[0][1] =
1015             mv[1][0] = mv[1][1] = 0;
1016         }else{
1017             for(list=0; list<2; list++){
1018                 if(ref[list] >= 0)
1019                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1020                 else
1021                     mv[list][0] = mv[list][1] = 0;
1022             }
1023         }
1024
             /* remove the flag of the list that has no reference from the types */
1025         if(ref[1] < 0){
1026             if(!is_b8x8)
1027                 *mb_type &= ~MB_TYPE_L1;
1028             sub_mb_type &= ~MB_TYPE_L1;
1029         }else if(ref[0] < 0){
1030             if(!is_b8x8)
1031                 *mb_type &= ~MB_TYPE_L0;
1032             sub_mb_type &= ~MB_TYPE_L0;
1033         }
1034
             /* current and colocated macroblock differ in their interlaced flag:
              * the colocated pointers and strides are adjusted and the macroblock
              * is handled as four 8x8 partitions */
1035         if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1036             int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1037             int mb_types_col[2];
1038             int b8_stride = h->b8_stride;
1039             int b4_stride = h->b_stride;
1040
1041             *mb_type = (*mb_type & ~MB_TYPE_16x16) | MB_TYPE_8x8;
1042
1043             if(IS_INTERLACED(*mb_type)){
1044                 mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1045                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1046                 if(s->mb_y&1){
1047                     l1ref0 -= 2*b8_stride;
1048                     l1ref1 -= 2*b8_stride;
1049                     l1mv0 -= 4*b4_stride;
1050                     l1mv1 -= 4*b4_stride;
1051                 }
1052                 b8_stride *= 3;
1053                 b4_stride *= 6;
1054             }else{
                     /* pick the colocated macroblock of the pair from the field POC
                      * distances to the current picture */
1055                 int cur_poc = s->current_picture_ptr->poc;
1056                 int *col_poc = h->ref_list[1]->field_poc;
1057                 int col_parity = FFABS(col_poc[0] - cur_poc) >= FFABS(col_poc[1] - cur_poc);
1058                 int dy = 2*col_parity - (s->mb_y&1);
1059                 mb_types_col[0] =
1060                 mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy + col_parity*s->mb_stride];
1061                 l1ref0 += dy*b8_stride;
1062                 l1ref1 += dy*b8_stride;
1063                 l1mv0 += 2*dy*b4_stride;
1064                 l1mv1 += 2*dy*b4_stride;
1065                 b8_stride = 0;
1066             }
1067
1068             for(i8=0; i8<4; i8++){
1069                 int x8 = i8&1;
1070                 int y8 = i8>>1;
1071                 int xy8 = x8+y8*b8_stride;
1072                 int xy4 = 3*x8+y8*b4_stride;
1073                 int a=0, b=0;
1074
                     /* in an 8x8 macroblock only the direct sub partitions are filled */
1075                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1076                     continue;
1077                 h->sub_mb_type[i8] = sub_mb_type;
1078
1079                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1080                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
                     /* colocated block is inter, has ref 0 and an MV within +-1:
                      * the predicted MV is kept only for lists with ref > 0,
                      * the others stay at 0 */
1081                 if(!IS_INTRA(mb_types_col[y8])
1082                    && (   (l1ref0[xy8] == 0 && FFABS(l1mv0[xy4][0]) <= 1 && FFABS(l1mv0[xy4][1]) <= 1)
1083                        || (l1ref0[xy8]  < 0 && l1ref1[xy8] == 0 && FFABS(l1mv1[xy4][0]) <= 1 && FFABS(l1mv1[xy4][1]) <= 1))){
1084                     if(ref[0] > 0)
1085                         a= pack16to32(mv[0][0],mv[0][1]);
1086                     if(ref[1] > 0)
1087                         b= pack16to32(mv[1][0],mv[1][1]);
1088                 }else{
1089                     a= pack16to32(mv[0][0],mv[0][1]);
1090                     b= pack16to32(mv[1][0],mv[1][1]);
1091                 }
1092                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, a, 4);
1093                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, b, 4);
1094             }
1095         }else if(IS_16X16(*mb_type)){
1096             int a=0, b=0;
1097
1098             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1099             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
                 /* same test as above for the whole macroblock; the list 1 variant
                  * is skipped for x264 builds 1..33 */
1100             if(!IS_INTRA(mb_type_col)
1101                && (   (l1ref0[0] == 0 && FFABS(l1mv0[0][0]) <= 1 && FFABS(l1mv0[0][1]) <= 1)
1102                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && FFABS(l1mv1[0][0]) <= 1 && FFABS(l1mv1[0][1]) <= 1
1103                        && (h->x264_build>33 || !h->x264_build)))){
1104                 if(ref[0] > 0)
1105                     a= pack16to32(mv[0][0],mv[0][1]);
1106                 if(ref[1] > 0)
1107                     b= pack16to32(mv[1][0],mv[1][1]);
1108             }else{
1109                 a= pack16to32(mv[0][0],mv[0][1]);
1110                 b= pack16to32(mv[1][0],mv[1][1]);
1111             }
1112             fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, a, 4);
1113             fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, b, 4);
1114         }else{
1115             for(i8=0; i8<4; i8++){
1116                 const int x8 = i8&1;
1117                 const int y8 = i8>>1;
1118
1119                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1120                     continue;
1121                 h->sub_mb_type[i8] = sub_mb_type;
1122
                     /* start from the predicted MVs, then zero them below where the
                      * colocated block is (nearly) static */
1123                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1124                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1125                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1126                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1127
1128                 /* col_zero_flag */
1129                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1130                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1131                                                   && (h->x264_build>33 || !h->x264_build)))){
1132                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1133                     if(IS_SUB_8X8(sub_mb_type)){
                             /* one colocated MV, taken from a corner of the 8x8 block */
1134                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1135                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1136                             if(ref[0] == 0)
1137                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1138                             if(ref[1] == 0)
1139                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1140                         }
1141                     }else
                         /* one colocated MV per 4x4 block */
1142                     for(i4=0; i4<4; i4++){
1143                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1144                         if(FFABS(mv_col[0]) <= 1 && FFABS(mv_col[1]) <= 1){
1145                             if(ref[0] == 0)
1146                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1147                             if(ref[1] == 0)
1148                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1149                         }
1150                     }
1151                 }
1152             }
1153         }
1154     }else{ /* direct temporal mv pred */
             /* tables translating colocated reference indices, and the scale
              * factor per resulting reference; replaced by the field variants for
              * interlaced macroblocks in MBAFF frames */
1155         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1156         const int *dist_scale_factor = h->dist_scale_factor;
1157
1158         if(FRAME_MBAFF){
1159             if(IS_INTERLACED(*mb_type)){
1160                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1161                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1162                 dist_scale_factor = h->dist_scale_factor_field;
1163             }
1164             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1165                 /* FIXME assumes direct_8x8_inference == 1 */
1166                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1167                 int mb_types_col[2];
1168                 int y_shift;
1169
1170                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1171                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1172                          | (*mb_type & MB_TYPE_INTERLACED);
1173                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1174
1175                 if(IS_INTERLACED(*mb_type)){
1176                     /* frame to field scaling */
1177                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1178                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1179                     if(s->mb_y&1){
1180                         l1ref0 -= 2*h->b8_stride;
1181                         l1ref1 -= 2*h->b8_stride;
1182                         l1mv0 -= 4*h->b_stride;
1183                         l1mv1 -= 4*h->b_stride;
1184                     }
1185                     y_shift = 0;
1186
1187                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1188                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1189                        && !is_b8x8)
1190                         *mb_type |= MB_TYPE_16x8;
1191                     else
1192                         *mb_type |= MB_TYPE_8x8;
1193                 }else{
1194                     /* field to frame scaling */
1195                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1196                      * but in MBAFF, top and bottom POC are equal */
1197                     int dy = (s->mb_y&1) ? 1 : 2;
1198                     mb_types_col[0] =
1199                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1200                     l1ref0 += dy*h->b8_stride;
1201                     l1ref1 += dy*h->b8_stride;
1202                     l1mv0 += 2*dy*h->b_stride;
1203                     l1mv1 += 2*dy*h->b_stride;
1204                     y_shift = 2;
1205
1206                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1207                        && !is_b8x8)
1208                         *mb_type |= MB_TYPE_16x16;
1209                     else
1210                         *mb_type |= MB_TYPE_8x8;
1211                 }
1212
1213                 for(i8=0; i8<4; i8++){
1214                     const int x8 = i8&1;
1215                     const int y8 = i8>>1;
1216                     int ref0, scale;
1217                     const int16_t (*l1mv)[2]= l1mv0;
1218
1219                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1220                         continue;
1221                     h->sub_mb_type[i8] = sub_mb_type;
1222
1223                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                         /* intra colocated macroblock: ref 0 and zero MVs in both lists */
1224                     if(IS_INTRA(mb_types_col[y8])){
1225                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1226                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1227                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1228                         continue;
1229                     }
1230
                         /* use the colocated list 0 reference if there is one,
                          * otherwise the list 1 reference and the list 1 MVs */
1231                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1232                     if(ref0 >= 0)
1233                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1234                     else{
1235                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1236                         l1mv= l1mv1;
1237                     }
1238                     scale = dist_scale_factor[ref0];
1239                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1240
1241                     {
1242                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
                             /* vertical component is halved (y_shift 0) or doubled (y_shift 2) */
1243                         int my_col = (mv_col[1]<<y_shift)/2;
1244                         int mx = (scale * mv_col[0] + 128) >> 8;
1245                         int my = (scale * my_col + 128) >> 8;
1246                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1247                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1248                     }
1249                 }
1250                 return;
1251             }
1252         }
1253
1254         /* one-to-one mv scaling */
1255
1256         if(IS_16X16(*mb_type)){
1257             int ref, mv0, mv1;
1258
1259             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1260             if(IS_INTRA(mb_type_col)){
1261                 ref=mv0=mv1=0;
1262             }else{
1263                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1264                                                 : map_col_to_list0[1][l1ref1[0]];
1265                 const int scale = dist_scale_factor[ref0];
1266                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1267                 int mv_l0[2];
                     /* list 0 MV is the scaled colocated MV, list 1 MV is the list 0
                      * MV minus the colocated MV */
1268                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1269                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1270                 ref= ref0;
1271                 mv0= pack16to32(mv_l0[0],mv_l0[1]);
1272                 mv1= pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1273             }
1274             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref, 1);
1275             fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, mv0, 4);
1276             fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, mv1, 4);
1277         }else{
1278             for(i8=0; i8<4; i8++){
1279                 const int x8 = i8&1;
1280                 const int y8 = i8>>1;
1281                 int ref0, scale;
1282                 const int16_t (*l1mv)[2]= l1mv0;
1283
1284                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1285                     continue;
1286                 h->sub_mb_type[i8] = sub_mb_type;
1287                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1288                 if(IS_INTRA(mb_type_col)){
1289                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1290                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1291                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1292                     continue;
1293                 }
1294
1295                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1296                 if(ref0 >= 0)
1297                     ref0 = map_col_to_list0[0][ref0];
1298                 else{
1299                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1300                     l1mv= l1mv1;
1301                 }
1302                 scale = dist_scale_factor[ref0];
1303
1304                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1305                 if(IS_SUB_8X8(sub_mb_type)){
                         /* one scaled MV for the whole 8x8 block */
1306                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1307                     int mx = (scale * mv_col[0] + 128) >> 8;
1308                     int my = (scale * mv_col[1] + 128) >> 8;
1309                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1310                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1311                 }else
                     /* one scaled MV per 4x4 block */
1312                 for(i4=0; i4<4; i4++){
1313                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1314                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1315                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1316                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1317                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1318                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1319                 }
1320             }
1321         }
1322     }
1323 }
1324
1325 static inline void write_back_motion(H264Context *h, int mb_type){
1326     MpegEncContext * const s = &h->s;
1327     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1328     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1329     int list;
1330
1331     if(!USES_LIST(mb_type, 0))
1332         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1333
1334     for(list=0; list<h->list_count; list++){
1335         int y;
1336         if(!USES_LIST(mb_type, list))
1337             continue;
1338
1339         for(y=0; y<4; y++){
1340             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1341             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1342         }
1343         if( h->pps.cabac ) {
1344             if(IS_SKIP(mb_type))
1345                 fill_rectangle(h->mvd_table[list][b_xy], 4, 4, h->b_stride, 0, 4);
1346             else
1347             for(y=0; y<4; y++){
1348                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1349                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1350             }
1351         }
1352
1353         {
1354             int8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1355             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1356             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1357             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1358             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1359         }
1360     }
1361
1362     if(h->slice_type == FF_B_TYPE && h->pps.cabac){
1363         if(IS_8X8(mb_type)){
1364             uint8_t *direct_table = &h->direct_table[b8_xy];
1365             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1366             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1367             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1368         }
1369     }
1370 }
1371
1372 /**
1373  * Decodes a network abstraction layer unit.
1374  * @param consumed is the number of bytes used as input
1375  * @param length is the length of the array
1376  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1377  * @returns decoded bytes, might be src+1 if no escapes
1378  */
1379 static const uint8_t *decode_nal(H264Context *h, const uint8_t *src, int *dst_length, int *consumed, int length){
1380     int i, si, di;
1381     uint8_t *dst;
1382     int bufidx;
1383
1384 //    src[0]&0x80;                //forbidden bit
1385     h->nal_ref_idc= src[0]>>5;
1386     h->nal_unit_type= src[0]&0x1F;
1387
1388     src++; length--;
1389 #if 0
1390     for(i=0; i<length; i++)
1391         printf("%2X ", src[i]);
1392 #endif
1393     for(i=0; i+1<length; i+=2){
1394         if(src[i]) continue;
1395         if(i>0 && src[i-1]==0) i--;
1396         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1397             if(src[i+2]!=3){
1398                 /* startcode, so we must be past the end */
1399                 length=i;
1400             }
1401             break;
1402         }
1403     }
1404
1405     if(i>=length-1){ //no escaped 0
1406         *dst_length= length;
1407         *consumed= length+1; //+1 for the header
1408         return src;
1409     }
1410
1411     bufidx = h->nal_unit_type == NAL_DPC ? 1 : 0; // use second escape buffer for inter data
1412     h->rbsp_buffer[bufidx]= av_fast_realloc(h->rbsp_buffer[bufidx], &h->rbsp_buffer_size[bufidx], length);
1413     dst= h->rbsp_buffer[bufidx];
1414
1415     if (dst == NULL){
1416         return NULL;
1417     }
1418
1419 //printf("decoding esc\n");
1420     si=di=0;
1421     while(si<length){
1422         //remove escapes (very rare 1:2^22)
1423         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1424             if(src[si+2]==3){ //escape
1425                 dst[di++]= 0;
1426                 dst[di++]= 0;
1427                 si+=3;
1428                 continue;
1429             }else //next start code
1430                 break;
1431         }
1432
1433         dst[di++]= src[si++];
1434     }
1435
1436     *dst_length= di;
1437     *consumed= si + 1;//+1 for the header
1438 //FIXME store exact number of bits in the getbitcontext (it is needed for decoding)
1439     return dst;
1440 }
1441
1442 /**
1443  * identifies the exact end of the bitstream
1444  * @return the length of the trailing, or 0 if damaged
1445  */
1446 static int decode_rbsp_trailing(H264Context *h, const uint8_t *src){
1447     int v= *src;
1448     int r;
1449
1450     tprintf(h->s.avctx, "rbsp trailing %X\n", v);
1451
1452     for(r=1; r<9; r++){
1453         if(v&1) return r;
1454         v>>=1;
1455     }
1456     return 0;
1457 }
1458
1459 /**
1460  * idct tranforms the 16 dc values and dequantize them.
1461  * @param qp quantization parameter
1462  */
1463 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1464 #define stride 16
1465     int i;
1466     int temp[16]; //FIXME check if this is a good idea
1467     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1468     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1469
1470 //memset(block, 64, 2*256);
1471 //return;
1472     for(i=0; i<4; i++){
1473         const int offset= y_offset[i];
1474         const int z0= block[offset+stride*0] + block[offset+stride*4];
1475         const int z1= block[offset+stride*0] - block[offset+stride*4];
1476         const int z2= block[offset+stride*1] - block[offset+stride*5];
1477         const int z3= block[offset+stride*1] + block[offset+stride*5];
1478
1479         temp[4*i+0]= z0+z3;
1480         temp[4*i+1]= z1+z2;
1481         temp[4*i+2]= z1-z2;
1482         temp[4*i+3]= z0-z3;
1483     }
1484
1485     for(i=0; i<4; i++){
1486         const int offset= x_offset[i];
1487         const int z0= temp[4*0+i] + temp[4*2+i];
1488         const int z1= temp[4*0+i] - temp[4*2+i];
1489         const int z2= temp[4*1+i] - temp[4*3+i];
1490         const int z3= temp[4*1+i] + temp[4*3+i];
1491
1492         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1493         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1494         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1495         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1496     }
1497 }
1498
#if 0
/**
 * DCT transforms the 16 dc values.
 * NOTE: this function is compiled out (#if 0). It relies on the "stride"
 * macro that is still defined at this point in the file.
 * @param qp quantization parameter ??? FIXME (the parameter is commented out
 *           of the signature)
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: butterflies over block, results collected in temp */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: butterflies over temp, halved and stored back into block */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
#endif
1538
1539 #undef xStride
1540 #undef stride
1541
1542 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1543     const int stride= 16*2;
1544     const int xStride= 16;
1545     int a,b,c,d,e;
1546
1547     a= block[stride*0 + xStride*0];
1548     b= block[stride*0 + xStride*1];
1549     c= block[stride*1 + xStride*0];
1550     d= block[stride*1 + xStride*1];
1551
1552     e= a-b;
1553     a= a+b;
1554     b= c-d;
1555     c= c+d;
1556
1557     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1558     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1559     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1560     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1561 }
1562
#if 0
/**
 * Applies a 2x2 butterfly transform to the four chroma dc values in place,
 * without any scaling.
 * NOTE: this function is compiled out (#if 0).
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    /* load the 2x2 dc values */
    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* horizontal sums/differences; a, b and c are reused as temporaries */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    /* vertical sums/differences */
    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
#endif
1585
1586 /**
1587  * gets the chroma qp.
1588  */
1589 static inline int get_chroma_qp(H264Context *h, int t, int qscale){
1590     return h->pps.chroma_qp_table[t][qscale & 0xff];
1591 }
1592
1593 //FIXME need to check that this does not overflow signed 32 bit for low qp, I am not sure, it's very close
1594 //FIXME check that gcc inlines this (and optimizes intra & separate_dc stuff away)
1595 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int separate_dc){
1596     int i;
1597     const int * const quant_table= quant_coeff[qscale];
1598     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1599     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1600     const unsigned int threshold2= (threshold1<<1);
1601     int last_non_zero;
1602
1603     if(separate_dc){
1604         if(qscale<=18){
1605             //avoid overflows
1606             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1607             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1608             const unsigned int dc_threshold2= (dc_threshold1<<1);
1609
1610             int level= block[0]*quant_coeff[qscale+18][0];
1611             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1612                 if(level>0){
1613                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1614                     block[0]= level;
1615                 }else{
1616                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1617                     block[0]= -level;
1618                 }
1619 //                last_non_zero = i;
1620             }else{
1621                 block[0]=0;
1622             }
1623         }else{
1624             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1625             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1626             const unsigned int dc_threshold2= (dc_threshold1<<1);
1627
1628             int level= block[0]*quant_table[0];
1629             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1630                 if(level>0){
1631                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1632                     block[0]= level;
1633                 }else{
1634                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1635                     block[0]= -level;
1636                 }
1637 //                last_non_zero = i;
1638             }else{
1639                 block[0]=0;
1640             }
1641         }
1642         last_non_zero= 0;
1643         i=1;
1644     }else{
1645         last_non_zero= -1;
1646         i=0;
1647     }
1648
1649     for(; i<16; i++){
1650         const int j= scantable[i];
1651         int level= block[j]*quant_table[j];
1652
1653 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1654 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1655         if(((unsigned)(level+threshold1))>threshold2){
1656             if(level>0){
1657                 level= (bias + level)>>QUANT_SHIFT;
1658                 block[j]= level;
1659             }else{
1660                 level= (bias - level)>>QUANT_SHIFT;
1661                 block[j]= -level;
1662             }
1663             last_non_zero = i;
1664         }else{
1665             block[j]=0;
1666         }
1667     }
1668
1669     return last_non_zero;
1670 }
1671
1672 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
1673                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1674                            int src_x_offset, int src_y_offset,
1675                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
1676     MpegEncContext * const s = &h->s;
1677     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
1678     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
1679     const int luma_xy= (mx&3) + ((my&3)<<2);
1680     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
1681     uint8_t * src_cb, * src_cr;
1682     int extra_width= h->emu_edge_width;
1683     int extra_height= h->emu_edge_height;
1684     int emu=0;
1685     const int full_mx= mx>>2;
1686     const int full_my= my>>2;
1687     const int pic_width  = 16*s->mb_width;
1688     const int pic_height = 16*s->mb_height >> MB_FIELD;
1689
1690     if(!pic->data[0]) //FIXME this is unacceptable, some senseable error concealment must be done for missing reference frames
1691         return;
1692
1693     if(mx&7) extra_width -= 3;
1694     if(my&7) extra_height -= 3;
1695
1696     if(   full_mx < 0-extra_width
1697        || full_my < 0-extra_height
1698        || full_mx + 16/*FIXME*/ > pic_width + extra_width
1699        || full_my + 16/*FIXME*/ > pic_height + extra_height){
1700         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
1701             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
1702         emu=1;
1703     }
1704
1705     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
1706     if(!square){
1707         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
1708     }
1709
1710     if(ENABLE_GRAY && s->flags&CODEC_FLAG_GRAY) return;
1711
1712     if(MB_FIELD){
1713         // chroma offset when predicting from a field of opposite parity
1714         my += 2 * ((s->mb_y & 1) - (pic->reference - 1));
1715         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
1716     }
1717     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1718     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
1719
1720     if(emu){
1721         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1722             src_cb= s->edge_emu_buffer;
1723     }
1724     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1725
1726     if(emu){
1727         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
1728             src_cr= s->edge_emu_buffer;
1729     }
1730     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
1731 }
1732
1733 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
1734                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1735                            int x_offset, int y_offset,
1736                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1737                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1738                            int list0, int list1){
1739     MpegEncContext * const s = &h->s;
1740     qpel_mc_func *qpix_op=  qpix_put;
1741     h264_chroma_mc_func chroma_op= chroma_put;
1742
1743     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1744     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1745     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1746     x_offset += 8*s->mb_x;
1747     y_offset += 8*(s->mb_y >> MB_FIELD);
1748
1749     if(list0){
1750         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
1751         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
1752                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1753                            qpix_op, chroma_op);
1754
1755         qpix_op=  qpix_avg;
1756         chroma_op= chroma_avg;
1757     }
1758
1759     if(list1){
1760         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
1761         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
1762                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
1763                            qpix_op, chroma_op);
1764     }
1765 }
1766
1767 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
1768                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1769                            int x_offset, int y_offset,
1770                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1771                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
1772                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
1773                            int list0, int list1){
1774     MpegEncContext * const s = &h->s;
1775
1776     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
1777     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
1778     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
1779     x_offset += 8*s->mb_x;
1780     y_offset += 8*(s->mb_y >> MB_FIELD);
1781
1782     if(list0 && list1){
1783         /* don't optimize for luma-only case, since B-frames usually
1784          * use implicit weights => chroma too. */
1785         uint8_t *tmp_cb = s->obmc_scratchpad;
1786         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
1787         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
1788         int refn0 = h->ref_cache[0][ scan8[n] ];
1789         int refn1 = h->ref_cache[1][ scan8[n] ];
1790
1791         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
1792                     dest_y, dest_cb, dest_cr,
1793                     x_offset, y_offset, qpix_put, chroma_put);
1794         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
1795                     tmp_y, tmp_cb, tmp_cr,
1796                     x_offset, y_offset, qpix_put, chroma_put);
1797
1798         if(h->use_weight == 2){
1799             int weight0 = h->implicit_weight[refn0][refn1];
1800             int weight1 = 64 - weight0;
1801             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
1802             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
1803             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
1804         }else{
1805             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
1806                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
1807                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
1808             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1809                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
1810                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
1811             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1812                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
1813                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
1814         }
1815     }else{
1816         int list = list1 ? 1 : 0;
1817         int refn = h->ref_cache[list][ scan8[n] ];
1818         Picture *ref= &h->ref_list[list][refn];
1819         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
1820                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
1821                     qpix_put, chroma_put);
1822
1823         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
1824                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
1825         if(h->use_weight_chroma){
1826             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1827                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
1828             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
1829                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
1830         }
1831     }
1832 }
1833
1834 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
1835                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
1836                            int x_offset, int y_offset,
1837                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
1838                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
1839                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
1840                            int list0, int list1){
1841     if((h->use_weight==2 && list0 && list1
1842         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
1843        || h->use_weight==1)
1844         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1845                          x_offset, y_offset, qpix_put, chroma_put,
1846                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
1847     else
1848         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
1849                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
1850 }
1851
1852 static inline void prefetch_motion(H264Context *h, int list){
1853     /* fetch pixels for estimated mv 4 macroblocks ahead
1854      * optimized for 64byte cache lines */
1855     MpegEncContext * const s = &h->s;
1856     const int refn = h->ref_cache[list][scan8[0]];
1857     if(refn >= 0){
1858         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
1859         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
1860         uint8_t **src= h->ref_list[list][refn].data;
1861         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
1862         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1863         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
1864         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1865     }
1866 }
1867
/**
 * Performs motion compensation for the current inter macroblock.
 * The macroblock type selects how the macroblock is split; every partition
 * is handed to mc_part() as (n, square, chroma_height, delta, ..., x_offset,
 * y_offset, ...) together with the function tables matching its size.
 * @param qpix_put / qpix_avg luma function tables, first index selects the size
 * @param chroma_put / chroma_avg chroma function tables, index selects the size
 * @param weight_op / weight_avg weighting function tables, a different start
 *                               entry is passed for each partition shape
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    /* list 0 is prefetched before, list 1 after the compensation below */
    prefetch_motion(h, 0);

    if(IS_16X16(mb_type)){
        /* one square partition covering the whole macroblock */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* upper and lower half; each is done as two luma calls 8 bytes apart */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* left and right half; each is done as two luma calls 8 lines apart */
        mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 sub macroblocks, each with its own sub type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            /* offsets of the sub macroblock: 0 or 4 in each direction */
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                /* upper and lower half of the sub macroblock */
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                /* left and right half of the sub macroblock */
                mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside the sub macroblock */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }

    prefetch_motion(h, 1);
}
1954
1955 static void decode_init_vlc(void){
1956     static int done = 0;
1957
1958     if (!done) {
1959         int i;
1960         done = 1;
1961
1962         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
1963                  &chroma_dc_coeff_token_len [0], 1, 1,
1964                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
1965
1966         for(i=0; i<4; i++){
1967             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
1968                      &coeff_token_len [i][0], 1, 1,
1969                      &coeff_token_bits[i][0], 1, 1, 1);
1970         }
1971
1972         for(i=0; i<3; i++){
1973             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
1974                      &chroma_dc_total_zeros_len [i][0], 1, 1,
1975                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
1976         }
1977         for(i=0; i<15; i++){
1978             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
1979                      &total_zeros_len [i][0], 1, 1,
1980                      &total_zeros_bits[i][0], 1, 1, 1);
1981         }
1982
1983         for(i=0; i<6; i++){
1984             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
1985                      &run_len [i][0], 1, 1,
1986                      &run_bits[i][0], 1, 1, 1);
1987         }
1988         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
1989                  &run_len [6][0], 1, 1,
1990                  &run_bits[6][0], 1, 1, 1);
1991     }
1992 }
1993
1994 static void free_tables(H264Context *h){
1995     int i;
1996     H264Context *hx;
1997     av_freep(&h->intra4x4_pred_mode);
1998     av_freep(&h->chroma_pred_mode_table);
1999     av_freep(&h->cbp_table);
2000     av_freep(&h->mvd_table[0]);
2001     av_freep(&h->mvd_table[1]);
2002     av_freep(&h->direct_table);
2003     av_freep(&h->non_zero_count);
2004     av_freep(&h->slice_table_base);
2005     h->slice_table= NULL;
2006
2007     av_freep(&h->mb2b_xy);
2008     av_freep(&h->mb2b8_xy);
2009
2010     for(i = 0; i < MAX_SPS_COUNT; i++)
2011         av_freep(h->sps_buffers + i);
2012
2013     for(i = 0; i < MAX_PPS_COUNT; i++)
2014         av_freep(h->pps_buffers + i);
2015
2016     for(i = 0; i < h->s.avctx->thread_count; i++) {
2017         hx = h->thread_context[i];
2018         if(!hx) continue;
2019         av_freep(&hx->top_borders[1]);
2020         av_freep(&hx->top_borders[0]);
2021         av_freep(&hx->s.obmc_scratchpad);
2022     }
2023 }
2024
2025 static void init_dequant8_coeff_table(H264Context *h){
2026     int i,q,x;
2027     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2028     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2029     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2030
2031     for(i=0; i<2; i++ ){
2032         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2033             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2034             break;
2035         }
2036
2037         for(q=0; q<52; q++){
2038             int shift = ff_div6[q];
2039             int idx = ff_rem6[q];
2040             for(x=0; x<64; x++)
2041                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2042                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2043                     h->pps.scaling_matrix8[i][x]) << shift;
2044         }
2045     }
2046 }
2047
2048 static void init_dequant4_coeff_table(H264Context *h){
2049     int i,j,q,x;
2050     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2051     for(i=0; i<6; i++ ){
2052         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2053         for(j=0; j<i; j++){
2054             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2055                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2056                 break;
2057             }
2058         }
2059         if(j<i)
2060             continue;
2061
2062         for(q=0; q<52; q++){
2063             int shift = ff_div6[q] + 2;
2064             int idx = ff_rem6[q];
2065             for(x=0; x<16; x++)
2066                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
2067                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
2068                     h->pps.scaling_matrix4[i][x]) << shift;
2069         }
2070     }
2071 }
2072
2073 static void init_dequant_tables(H264Context *h){
2074     int i,x;
2075     init_dequant4_coeff_table(h);
2076     if(h->pps.transform_8x8_mode)
2077         init_dequant8_coeff_table(h);
2078     if(h->sps.transform_bypass){
2079         for(i=0; i<6; i++)
2080             for(x=0; x<16; x++)
2081                 h->dequant4_coeff[i][0][x] = 1<<6;
2082         if(h->pps.transform_8x8_mode)
2083             for(i=0; i<2; i++)
2084                 for(x=0; x<64; x++)
2085                     h->dequant8_coeff[i][0][x] = 1<<6;
2086     }
2087 }
2088
2089
2090 /**
2091  * allocates tables.
2092  * needs width/height
2093  */
2094 static int alloc_tables(H264Context *h){
2095     MpegEncContext * const s = &h->s;
2096     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2097     int x,y;
2098
2099     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2100
2101     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2102     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
2103     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2104
2105     CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2106     CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2107     CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2108     CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2109
2110     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
2111     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
2112
2113     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2114     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2115     for(y=0; y<s->mb_height; y++){
2116         for(x=0; x<s->mb_width; x++){
2117             const int mb_xy= x + y*s->mb_stride;
2118             const int b_xy = 4*x + 4*y*h->b_stride;
2119             const int b8_xy= 2*x + 2*y*h->b8_stride;
2120
2121             h->mb2b_xy [mb_xy]= b_xy;
2122             h->mb2b8_xy[mb_xy]= b8_xy;
2123         }
2124     }
2125
2126     s->obmc_scratchpad = NULL;
2127
2128     if(!h->dequant4_coeff[0])
2129         init_dequant_tables(h);
2130
2131     return 0;
2132 fail:
2133     free_tables(h);
2134     return -1;
2135 }
2136
2137 /**
2138  * Mimic alloc_tables(), but for every context thread.
2139  */
2140 static void clone_tables(H264Context *dst, H264Context *src){
2141     dst->intra4x4_pred_mode       = src->intra4x4_pred_mode;
2142     dst->non_zero_count           = src->non_zero_count;
2143     dst->slice_table              = src->slice_table;
2144     dst->cbp_table                = src->cbp_table;
2145     dst->mb2b_xy                  = src->mb2b_xy;
2146     dst->mb2b8_xy                 = src->mb2b8_xy;
2147     dst->chroma_pred_mode_table   = src->chroma_pred_mode_table;
2148     dst->mvd_table[0]             = src->mvd_table[0];
2149     dst->mvd_table[1]             = src->mvd_table[1];
2150     dst->direct_table             = src->direct_table;
2151
2152     dst->s.obmc_scratchpad = NULL;
2153     ff_h264_pred_init(&dst->hpc, src->s.codec_id);
2154 }
2155
2156 /**
2157  * Init context
2158  * Allocate buffers which are not shared amongst multiple threads.
2159  */
2160 static int context_init(H264Context *h){
2161     CHECKED_ALLOCZ(h->top_borders[0], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2162     CHECKED_ALLOCZ(h->top_borders[1], h->s.mb_width * (16+8+8) * sizeof(uint8_t))
2163
2164     return 0;
2165 fail:
2166     return -1; // free_tables will clean up for us
2167 }
2168
2169 static void common_init(H264Context *h){
2170     MpegEncContext * const s = &h->s;
2171
2172     s->width = s->avctx->width;
2173     s->height = s->avctx->height;
2174     s->codec_id= s->avctx->codec->id;
2175
2176     ff_h264_pred_init(&h->hpc, s->codec_id);
2177
2178     h->dequant_coeff_pps= -1;
2179     s->unrestricted_mv=1;
2180     s->decode=1; //FIXME
2181
2182     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
2183     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
2184 }
2185
2186 static int decode_init(AVCodecContext *avctx){
2187     H264Context *h= avctx->priv_data;
2188     MpegEncContext * const s = &h->s;
2189
2190     MPV_decode_defaults(s);
2191
2192     s->avctx = avctx;
2193     common_init(h);
2194
2195     s->out_format = FMT_H264;
2196     s->workaround_bugs= avctx->workaround_bugs;
2197
2198     // set defaults
2199 //    s->decode_mb= ff_h263_decode_mb;
2200     s->quarter_sample = 1;
2201     s->low_delay= 1;
2202     avctx->pix_fmt= PIX_FMT_YUV420P;
2203
2204     decode_init_vlc();
2205
2206     if(avctx->extradata_size > 0 && avctx->extradata &&
2207        *(char *)avctx->extradata == 1){
2208         h->is_avc = 1;
2209         h->got_avcC = 0;
2210     } else {
2211         h->is_avc = 0;
2212     }
2213
2214     h->thread_context[0] = h;
2215     return 0;
2216 }
2217
2218 static int frame_start(H264Context *h){
2219     MpegEncContext * const s = &h->s;
2220     int i;
2221
2222     if(MPV_frame_start(s, s->avctx) < 0)
2223         return -1;
2224     ff_er_frame_start(s);
2225     /*
2226      * MPV_frame_start uses pict_type to derive key_frame.
2227      * This is incorrect for H.264; IDR markings must be used.
2228      * Zero here; IDR markings per slice in frame or fields are OR'd in later.
2229      * See decode_nal_units().
2230      */
2231     s->current_picture_ptr->key_frame= 0;
2232
2233     assert(s->linesize && s->uvlinesize);
2234
2235     for(i=0; i<16; i++){
2236         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2237         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2238     }
2239     for(i=0; i<4; i++){
2240         h->block_offset[16+i]=
2241         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2242         h->block_offset[24+16+i]=
2243         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2244     }
2245
2246     /* can't be in alloc_tables because linesize isn't known there.
2247      * FIXME: redo bipred weight to not require extra buffer? */
2248     for(i = 0; i < s->avctx->thread_count; i++)
2249         if(!h->thread_context[i]->s.obmc_scratchpad)
2250             h->thread_context[i]->s.obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
2251
2252     /* some macroblocks will be accessed before they're available */
2253     if(FRAME_MBAFF || s->avctx->thread_count > 1)
2254         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
2255
2256 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2257     return 0;
2258 }
2259
2260 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int simple){
2261     MpegEncContext * const s = &h->s;
2262     int i;
2263
2264     src_y  -=   linesize;
2265     src_cb -= uvlinesize;
2266     src_cr -= uvlinesize;
2267
2268     // There are two lines saved, the line above the the top macroblock of a pair,
2269     // and the line above the bottom macroblock
2270     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2271     for(i=1; i<17; i++){
2272         h->left_border[i]= src_y[15+i*  linesize];
2273     }
2274
2275     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2276     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2277
2278     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2279         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2280         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2281         for(i=1; i<9; i++){
2282             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2283             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2284         }
2285         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2286         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2287     }
2288 }
2289
2290 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg, int simple){
2291     MpegEncContext * const s = &h->s;
2292     int temp8, i;
2293     uint64_t temp64;
2294     int deblock_left;
2295     int deblock_top;
2296     int mb_xy;
2297
2298     if(h->deblocking_filter == 2) {
2299         mb_xy = s->mb_x + s->mb_y*s->mb_stride;
2300         deblock_left = h->slice_table[mb_xy] == h->slice_table[mb_xy - 1];
2301         deblock_top  = h->slice_table[mb_xy] == h->slice_table[h->top_mb_xy];
2302     } else {
2303         deblock_left = (s->mb_x > 0);
2304         deblock_top =  (s->mb_y > 0);
2305     }
2306
2307     src_y  -=   linesize + 1;
2308     src_cb -= uvlinesize + 1;
2309     src_cr -= uvlinesize + 1;
2310
2311 #define XCHG(a,b,t,xchg)\
2312 t= a;\
2313 if(xchg)\
2314     a= b;\
2315 b= t;
2316
2317     if(deblock_left){
2318         for(i = !deblock_top; i<17; i++){
2319             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2320         }
2321     }
2322
2323     if(deblock_top){
2324         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2325         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2326         if(s->mb_x+1 < s->mb_width){
2327             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2328         }
2329     }
2330
2331     if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2332         if(deblock_left){
2333             for(i = !deblock_top; i<9; i++){
2334                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2335                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2336             }
2337         }
2338         if(deblock_top){
2339             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2340             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2341         }
2342     }
2343 }
2344
2345 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2346     MpegEncContext * const s = &h->s;
2347     int i;
2348
2349     src_y  -= 2 *   linesize;
2350     src_cb -= 2 * uvlinesize;
2351     src_cr -= 2 * uvlinesize;
2352
2353     // There are two lines saved, the line above the the top macroblock of a pair,
2354     // and the line above the bottom macroblock
2355     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2356     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2357     for(i=2; i<34; i++){
2358         h->left_border[i]= src_y[15+i*  linesize];
2359     }
2360
2361     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2362     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2363     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2364     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2365
2366     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2367         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2368         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2369         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2370         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2371         for(i=2; i<18; i++){
2372             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2373             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2374         }
2375         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2376         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2377         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2378         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2379     }
2380 }
2381
2382 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2383     MpegEncContext * const s = &h->s;
2384     int temp8, i;
2385     uint64_t temp64;
2386     int deblock_left = (s->mb_x > 0);
2387     int deblock_top  = (s->mb_y > 1);
2388
2389     tprintf(s->avctx, "xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2390
2391     src_y  -= 2 *   linesize + 1;
2392     src_cb -= 2 * uvlinesize + 1;
2393     src_cr -= 2 * uvlinesize + 1;
2394
2395 #define XCHG(a,b,t,xchg)\
2396 t= a;\
2397 if(xchg)\
2398     a= b;\
2399 b= t;
2400
2401     if(deblock_left){
2402         for(i = (!deblock_top)<<1; i<34; i++){
2403             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2404         }
2405     }
2406
2407     if(deblock_top){
2408         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2409         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2410         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2411         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2412         if(s->mb_x+1 < s->mb_width){
2413             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
2414             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
2415         }
2416     }
2417
2418     if(!ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
2419         if(deblock_left){
2420             for(i = (!deblock_top) << 1; i<18; i++){
2421                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2422                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2423             }
2424         }
2425         if(deblock_top){
2426             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2427             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2428             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2429             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2430         }
2431     }
2432 }
2433
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture and, if h->deblocking_filter is set, runs the
 * deblocking filter on it.
 *
 * Steps, in order: compute the destination pointers and line sizes,
 * select the residual add functions, perform intra prediction or motion
 * compensation, add the luma and chroma residuals, then deblock.
 *
 * @param h      decoder context
 * @param simple when nonzero, every "!simple &&" test below is false and
 *               every "simple ||" test is true, so the field/MBAFF,
 *               intra PCM, gray and non-H.264 (svq3_*) paths are not
 *               taken; hl_decode_mb() only passes 1 when none of those
 *               cases apply
 */
static av_always_inline void hl_decode_mb_internal(H264Context *h, int simple){
    MpegEncContext * const s = &h->s;
    const int mb_x= s->mb_x;
    const int mb_y= s->mb_y;
    const int mb_xy= mb_x + mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];
    uint8_t  *dest_y, *dest_cb, *dest_cr;
    int linesize, uvlinesize /*dct_offset*/;
    int i;
    int *block_offset = &h->block_offset[0];
    const unsigned int bottom = mb_y & 1;
    const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass), is_h264 = (simple || s->codec_id == CODEC_ID_H264);
    void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
    void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);

    // top-left pixel of this macroblock in each plane (16x16 luma, 8x8 chroma)
    dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
    dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
    dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

    s->dsp.prefetch(dest_y + (s->mb_x&3)*4*s->linesize + 64, s->linesize, 4);
    s->dsp.prefetch(dest_cb + (s->mb_x&7)*s->uvlinesize + 64, dest_cr - dest_cb, 2);

    if (!simple && MB_FIELD) {
        // field macroblock: doubled line sizes and the second set of block offsets
        linesize   = h->mb_linesize   = s->linesize * 2;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
        block_offset = &h->block_offset[24];
        if(mb_y&1){ //FIXME move out of this func?
            // odd macroblock row: start one picture line below the top of the previous row
            dest_y -= s->linesize*15;
            dest_cb-= s->uvlinesize*7;
            dest_cr-= s->uvlinesize*7;
        }
        if(FRAME_MBAFF) {
            int list;
            // rewrite the cached reference indices as (16+ref)^(mb_y&1)
            for(list=0; list<h->list_count; list++){
                if(!USES_LIST(mb_type, list))
                    continue;
                if(IS_16X16(mb_type)){
                    int8_t *ref = &h->ref_cache[list][scan8[0]];
                    fill_rectangle(ref, 4, 4, 8, (16+*ref)^(s->mb_y&1), 1);
                }else{
                    for(i=0; i<16; i+=4){
                        //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
                        int ref = h->ref_cache[list][scan8[i]];
                        if(ref >= 0)
                            fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, (16+ref)^(s->mb_y&1), 1);
                    }
                }
            }
        }
    } else {
        linesize   = h->mb_linesize   = s->linesize;
        uvlinesize = h->mb_uvlinesize = s->uvlinesize;
//        dct_offset = s->linesize * 16;
    }

    // choose the functions that add the luma residual to the prediction
    if(transform_bypass){
        idct_dc_add =
        idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
    }else if(IS_8x8DCT(mb_type)){
        idct_dc_add = s->dsp.h264_idct8_dc_add;
        idct_add = s->dsp.h264_idct8_add;
    }else{
        idct_dc_add = s->dsp.h264_idct_dc_add;
        idct_add = s->dsp.h264_idct_add;
    }

    // MBAFF + deblocking: exchange the pair border once per pair, before the
    // first intra macroblock of the pair is predicted
    if(!simple && FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
       && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
        int mbt_y = mb_y&~1;
        uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
        uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
        xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
    }

    if (!simple && IS_INTRA_PCM(mb_type)) {
        unsigned int x, y;

        // The pixels are stored in h->mb array in the same order as levels,
        // copy them in output in the correct order.
        for(i=0; i<16; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=16; i<16+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
        for(i=20; i<20+4; i++) {
            for (y=0; y<4; y++) {
                for (x=0; x<4; x++) {
                    *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
                }
            }
        }
    } else {
        if(IS_INTRA(mb_type)){
            // non-MBAFF deblocking: exchange the border pixels with the stored
            // copies for the duration of intra prediction (undone below with xchg=0)
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1, simple);

            if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
                h->hpc.pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
            }

            if(IS_INTRA4x4(mb_type)){
                if(simple || !s->encoding){
                    if(IS_8x8DCT(mb_type)){
                        // 8x8 intra blocks: predict, then add the residual, one block at a time
                        for(i=0; i<16; i+=4){
                            uint8_t * const ptr= dest_y + block_offset[i];
                            const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                            const int nnz = h->non_zero_count_cache[ scan8[i] ];
                            h->hpc.pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
                                                   (h->topright_samples_available<<i)&0x4000, linesize);
                            if(nnz){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }
                        }
                    }else
                    // 4x4 intra blocks: predict, then add the residual, one block at a time
                    for(i=0; i<16; i++){
                        uint8_t * const ptr= dest_y + block_offset[i];
                        uint8_t *topright;
                        const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
                        int nnz, tr;

                        if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
                            const int topright_avail= (h->topright_samples_available<<i)&0x8000;
                            assert(mb_y || linesize <= block_offset[i]);
                            if(!topright_avail){
                                // no top-right samples: replicate the last pixel of the row above
                                tr= ptr[3 - linesize]*0x01010101;
                                topright= (uint8_t*) &tr;
                            }else
                                topright= ptr + 4 - linesize;
                        }else
                            topright= NULL;

                        h->hpc.pred4x4[ dir ](ptr, topright, linesize);
                        nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(is_h264){
                                if(nnz == 1 && h->mb[i*16])
                                    idct_dc_add(ptr, h->mb + i*16, linesize);
                                else
                                    idct_add(ptr, h->mb + i*16, linesize);
                            }else
                                svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
                        }
                    }
                }
            }else{
                // intra 16x16: predict the whole block, then process the luma DC coefficients
                h->hpc.pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
                if(is_h264){
                    if(!transform_bypass)
                        h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[0][s->qscale][0]);
                }else
                    svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
            }
            if(h->deblocking_filter && (simple || !FRAME_MBAFF))
                xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0, simple);
        }else if(is_h264){
            // inter macroblock: motion compensation
            hl_motion(h, dest_y, dest_cb, dest_cr,
                      s->me.qpel_put, s->dsp.put_h264_chroma_pixels_tab,
                      s->me.qpel_avg, s->dsp.avg_h264_chroma_pixels_tab,
                      s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
        }


        // luma residual for everything except intra 4x4 (already added above)
        if(!IS_INTRA4x4(mb_type)){
            if(is_h264){
                if(IS_INTRA16x16(mb_type)){
                    for(i=0; i<16; i++){
                        if(h->non_zero_count_cache[ scan8[i] ])
                            idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        else if(h->mb[i*16])
                            idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                    }
                }else{
                    // step through 8x8 blocks (di=4) or 4x4 blocks (di=1)
                    const int di = IS_8x8DCT(mb_type) ? 4 : 1;
                    for(i=0; i<16; i+=di){
                        int nnz = h->non_zero_count_cache[ scan8[i] ];
                        if(nnz){
                            if(nnz==1 && h->mb[i*16])
                                idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                            else
                                idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
                        }
                    }
                }
            }else{
                for(i=0; i<16; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
                        uint8_t * const ptr= dest_y + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
                    }
                }
            }
        }

        // chroma residual; blocks 16..19 go to dest_cb, 20..23 to dest_cr
        if(simple || !ENABLE_GRAY || !(s->flags&CODEC_FLAG_GRAY)){
            uint8_t *dest[2] = {dest_cb, dest_cr};
            if(transform_bypass){
                idct_add = idct_dc_add = s->dsp.add_pixels4;
            }else{
                idct_add = s->dsp.h264_idct_add;
                idct_dc_add = s->dsp.h264_idct_dc_add;
                chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp[0], h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp[0]][0]);
                chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp[1], h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp[1]][0]);
            }
            if(is_h264){
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ])
                        idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                    else if(h->mb[i*16])
                        idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
                }
            }else{
                for(i=16; i<16+8; i++){
                    if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
                        uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
                        svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
                    }
                }
            }
        }
    }
    if(h->deblocking_filter) {
        if (!simple && FRAME_MBAFF) {
            //FIXME try deblocking one mb at a time?
            // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
            // NOTE(review): mb_y/mb_xy below shadow the outer ones and refer to the
            // row above; the two mb_type reads happen before the !bottom early
            // return, so for a top macroblock they index one row above the
            // current one -- confirm that mb_type is padded accordingly
            const int mb_y = s->mb_y - 1;
            uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
            const int mb_xy= mb_x + mb_y*s->mb_stride;
            const int mb_type_top   = s->current_picture.mb_type[mb_xy];
            const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
            // a pair is only filtered once its bottom macroblock has been reconstructed
            if (!bottom) return;
            pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
            pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
            pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;

            if(IS_INTRA(mb_type_top | mb_type_bottom))
                xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);

            backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
            // deblock a pair
            // top
            s->mb_y--;
            tprintf(h->s.avctx, "call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
            fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy]);
            filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
            // bottom
            s->mb_y++;
            tprintf(h->s.avctx, "call mbaff filter_mb\n");
            fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
            h->chroma_qp[0] = get_chroma_qp(h, 0, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            h->chroma_qp[1] = get_chroma_qp(h, 1, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
            filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        } else {
            // save the unfiltered border first, then filter this macroblock
            tprintf(h->s.avctx, "call filter_mb\n");
            backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, simple);
            fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
            filter_mb_fast(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
        }
    }
}
2709
2710 /**
2711  * Process a macroblock; this case avoids checks for expensive uncommon cases.
2712  */
2713 static void hl_decode_mb_simple(H264Context *h){
2714     hl_decode_mb_internal(h, 1);
2715 }
2716
2717 /**
2718  * Process a macroblock; this handles edge cases, such as interlacing.
2719  */
2720 static void av_noinline hl_decode_mb_complex(H264Context *h){
2721     hl_decode_mb_internal(h, 0);
2722 }
2723
2724 static void hl_decode_mb(H264Context *h){
2725     MpegEncContext * const s = &h->s;
2726     const int mb_x= s->mb_x;
2727     const int mb_y= s->mb_y;
2728     const int mb_xy= mb_x + mb_y*s->mb_stride;
2729     const int mb_type= s->current_picture.mb_type[mb_xy];
2730     int is_complex = FRAME_MBAFF || MB_FIELD || IS_INTRA_PCM(mb_type) || s->codec_id != CODEC_ID_H264 || (ENABLE_GRAY && (s->flags&CODEC_FLAG_GRAY)) || s->encoding;
2731
2732     if(!s->decode)
2733         return;
2734
2735     if (is_complex)
2736         hl_decode_mb_complex(h);
2737     else hl_decode_mb_simple(h);
2738 }
2739
2740 static void pic_as_field(Picture *pic, const int parity){
2741     int i;
2742     for (i = 0; i < 4; ++i) {
2743         if (parity == PICT_BOTTOM_FIELD)
2744             pic->data[i] += pic->linesize[i];
2745         pic->reference = parity;
2746         pic->linesize[i] *= 2;
2747     }
2748 }
2749
2750 static int split_field_copy(Picture *dest, Picture *src,
2751                             int parity, int id_add){
2752     int match = !!(src->reference & parity);
2753
2754     if (match) {
2755         *dest = *src;
2756         pic_as_field(dest, parity);
2757         dest->pic_id *= 2;
2758         dest->pic_id += id_add;
2759     }
2760
2761     return match;
2762 }
2763
2764 /**
2765  * Split one reference list into field parts, interleaving by parity
2766  * as per H.264 spec section 8.2.4.2.5. Output fields have their data pointers
2767  * set to look at the actual start of data for that field.
2768  *
2769  * @param dest output list
2770  * @param dest_len maximum number of fields to put in dest
2771  * @param src the source reference list containing fields and/or field pairs
2772  *            (aka short_ref/long_ref, or
2773  *             refFrameListXShortTerm/refFrameListLongTerm in spec-speak)
2774  * @param src_len number of Picture's in source (pairs and unmatched fields)
2775  * @param parity the parity of the picture being decoded/needing
2776  *        these ref pics (PICT_{TOP,BOTTOM}_FIELD)
2777  * @return number of fields placed in dest
2778  */
2779 static int split_field_half_ref_list(Picture *dest, int dest_len,
2780                                      Picture *src,  int src_len,  int parity){
2781     int same_parity   = 1;
2782     int same_i        = 0;
2783     int opp_i         = 0;
2784     int out_i;
2785     int field_output;
2786
2787     for (out_i = 0; out_i < dest_len; out_i += field_output) {
2788         if (same_parity && same_i < src_len) {
2789             field_output = split_field_copy(dest + out_i, src + same_i,
2790                                             parity, 1);
2791             same_parity = !field_output;
2792             same_i++;
2793
2794         } else if (opp_i < src_len) {
2795             field_output = split_field_copy(dest + out_i, src + opp_i,
2796                                             PICT_FRAME - parity, 0);
2797             same_parity = field_output;
2798             opp_i++;
2799
2800         } else {
2801             break;
2802         }
2803     }
2804
2805     return out_i;
2806 }
2807
2808 /**
2809  * Split the reference frame list into a reference field list.
2810  * This implements H.264 spec 8.2.4.2.5 for a combined input list.
2811  * The input list contains both reference field pairs and
2812  * unmatched reference fields; it is ordered as spec describes
2813  * RefPicListX for frames in 8.2.4.2.1 and 8.2.4.2.3, except that
2814  * unmatched field pairs are also present. Conceptually this is equivalent
2815  * to concatenation of refFrameListXShortTerm with refFrameListLongTerm.
2816  *
2817  * @param dest output reference list where ordered fields are to be placed
2818  * @param dest_len max number of fields to place at dest
2819  * @param src source reference list, as described above
2820  * @param src_len number of pictures (pairs and unmatched fields) in src
2821  * @param parity parity of field being currently decoded
2822  *        (one of PICT_{TOP,BOTTOM}_FIELD)
2823  * @param long_i index into src array that holds first long reference picture,
2824  *        or src_len if no long refs present.
2825  */
2826 static int split_field_ref_list(Picture *dest, int dest_len,
2827                                 Picture *src,  int src_len,
2828                                 int parity,    int long_i){
2829
2830     int i = split_field_half_ref_list(dest, dest_len, src, long_i, parity);
2831     dest += i;
2832     dest_len -= i;
2833
2834     i += split_field_half_ref_list(dest, dest_len, src + long_i,
2835                                    src_len - long_i, parity);
2836     return i;
2837 }
2838
2839 /**
2840  * fills the default_ref_list.
2841  */
2842 static int fill_default_ref_list(H264Context *h){
2843     MpegEncContext * const s = &h->s;
2844     int i;
2845     int smallest_poc_greater_than_current = -1;
2846     int structure_sel;
2847     Picture sorted_short_ref[32];
2848     Picture field_entry_list[2][32];
2849     Picture *frame_list[2];
2850
2851     if (FIELD_PICTURE) {
2852         structure_sel = PICT_FRAME;
2853         frame_list[0] = field_entry_list[0];
2854         frame_list[1] = field_entry_list[1];
2855     } else {
2856         structure_sel = 0;
2857         frame_list[0] = h->default_ref_list[0];
2858         frame_list[1] = h->default_ref_list[1];
2859     }
2860
2861     if(h->slice_type==FF_B_TYPE){
2862         int list;
2863         int len[2];
2864         int short_len[2];
2865         int out_i;
2866         int limit= INT_MIN;
2867
2868         /* sort frame according to poc in B slice */
2869         for(out_i=0; out_i<h->short_ref_count; out_i++){
2870             int best_i=INT_MIN;
2871             int best_poc=INT_MAX;
2872
2873             for(i=0; i<h->short_ref_count; i++){
2874                 const int poc= h->short_ref[i]->poc;
2875                 if(poc > limit && poc < best_poc){
2876                     best_poc= poc;
2877                     best_i= i;
2878                 }
2879             }
2880
2881             assert(best_i != INT_MIN);
2882
2883             limit= best_poc;
2884             sorted_short_ref[out_i]= *h->short_ref[best_i];
2885             tprintf(h->s.avctx, "sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
2886             if (-1 == smallest_poc_greater_than_current) {
2887                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
2888                     smallest_poc_greater_than_current = out_i;
2889                 }
2890             }
2891         }
2892
2893         tprintf(h->s.avctx, "current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
2894
2895         // find the largest poc
2896         for(list=0; list<2; list++){
2897             int index = 0;
2898             int j= -99;
2899             int step= list ? -1 : 1;
2900
2901             for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
2902                 int sel;
2903                 while(j<0 || j>= h->short_ref_count){
2904                     if(j != -99 && step == (list ? -1 : 1))
2905                         return -1;
2906                     step = -step;
2907                     j= smallest_poc_greater_than_current + (step>>1);
2908                 }
2909                 sel = sorted_short_ref[j].reference | structure_sel;
2910                 if(sel != PICT_FRAME) continue;
2911                 frame_list[list][index  ]= sorted_short_ref[j];
2912                 frame_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
2913             }
2914             short_len[list] = index;
2915
2916             for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
2917                 int sel;
2918                 if(h->long_ref[i] == NULL) continue;
2919                 sel = h->long_ref[i]->reference | structure_sel;
2920                 if(sel != PICT_FRAME) continue;
2921
2922                 frame_list[ list ][index  ]= *h->long_ref[i];
2923                 frame_list[ list ][index++].pic_id= i;
2924             }
2925             len[list] = index;
2926         }
2927
2928         for(list=0; list<2; list++){
2929             if (FIELD_PICTURE)
2930                 len[list] = split_field_ref_list(h->default_ref_list[list],
2931                                                  h->ref_count[list],
2932                                                  frame_list[list],
2933                                                  len[list],
2934                                                  s->picture_structure,
2935                                                  short_len[list]);
2936
2937             // swap the two first elements of L1 when L0 and L1 are identical
2938             if(list && len[0] > 1 && len[0] == len[1])
2939                 for(i=0; h->default_ref_list[0][i].data[0] == h->default_ref_list[1][i].data[0]; i++)
2940                     if(i == len[0]){
2941                         FFSWAP(Picture, h->default_ref_list[1][0], h->default_ref_list[1][1]);
2942                         break;
2943                     }
2944
2945             if(len[list] < h->ref_count[ list ])
2946                 memset(&h->default_ref_list[list][len[list]], 0, sizeof(Picture)*(h->ref_count[ list ] - len[list]));
2947         }
2948
2949
2950     }else{
2951         int index=0;
2952         int short_len;
2953         for(i=0; i<h->short_ref_count; i++){
2954             int sel;
2955             sel = h->short_ref[i]->reference | structure_sel;
2956             if(sel != PICT_FRAME) continue;
2957             frame_list[0][index  ]= *h->short_ref[i];
2958             frame_list[0][index++].pic_id= h->short_ref[i]->frame_num;
2959         }
2960         short_len = index;
2961         for(i = 0; i < 16; i++){
2962             int sel;
2963             if(h->long_ref[i] == NULL) continue;
2964             sel = h->long_ref[i]->reference | structure_sel;
2965             if(sel != PICT_FRAME) continue;
2966             frame_list[0][index  ]= *h->long_ref[i];
2967             frame_list[0][index++].pic_id= i;
2968         }
2969
2970         if (FIELD_PICTURE)
2971             index = split_field_ref_list(h->default_ref_list[0],
2972                                          h->ref_count[0], frame_list[0],
2973                                          index, s->picture_structure,
2974                                          short_len);
2975
2976         if(index < h->ref_count[0])
2977             memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
2978     }
2979 #ifdef TRACE
2980     for (i=0; i<h->ref_count[0]; i++) {
2981         tprintf(h->s.avctx, "List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
2982     }
2983     if(h->slice_type==FF_B_TYPE){
2984         for (i=0; i<h->ref_count[1]; i++) {
2985             tprintf(h->s.avctx, "List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[1][i].data[0]);
2986         }
2987     }
2988 #endif
2989     return 0;
2990 }
2991
2992 static void print_short_term(H264Context *h);
2993 static void print_long_term(H264Context *h);
2994
2995 /**
2996  * Extract structure information about the picture described by pic_num in
2997  * the current decoding context (frame or field). Note that pic_num is
2998  * picture number without wrapping (so, 0<=pic_num<max_pic_num).
2999  * @param pic_num picture number for which to extract structure information
3000  * @param structure one of PICT_XXX describing structure of picture
3001  *                      with pic_num
3002  * @return frame number (short term) or long term index of picture
3003  *         described by pic_num
3004  */
3005 static int pic_num_extract(H264Context *h, int pic_num, int *structure){
3006     MpegEncContext * const s = &h->s;
3007
3008     *structure = s->picture_structure;
3009     if(FIELD_PICTURE){
3010         if (!(pic_num & 1))
3011             /* opposite field */
3012             *structure ^= PICT_FRAME;
3013         pic_num >>= 1;
3014     }
3015
3016     return pic_num;
3017 }
3018
3019 static int decode_ref_pic_list_reordering(H264Context *h){
3020     MpegEncContext * const s = &h->s;
3021     int list, index, pic_structure;
3022
3023     print_short_term(h);
3024     print_long_term(h);
3025     if(h->slice_type==FF_I_TYPE || h->slice_type==FF_SI_TYPE) return 0; //FIXME move before func
3026
3027     for(list=0; list<h->list_count; list++){
3028         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3029
3030         if(get_bits1(&s->gb)){
3031             int pred= h->curr_pic_num;
3032
3033             for(index=0; ; index++){
3034                 unsigned int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3035                 unsigned int pic_id;
3036                 int i;
3037                 Picture *ref = NULL;
3038
3039                 if(reordering_of_pic_nums_idc==3)
3040                     break;
3041
3042                 if(index >= h->ref_count[list]){
3043                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3044                     return -1;
3045                 }
3046
3047                 if(reordering_of_pic_nums_idc<3){
3048                     if(reordering_of_pic_nums_idc<2){
3049                         const unsigned int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3050                         int frame_num;
3051
3052                         if(abs_diff_pic_num > h->max_pic_num){
3053                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3054                             return -1;
3055                         }
3056
3057                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3058                         else                                pred+= abs_diff_pic_num;
3059                         pred &= h->max_pic_num - 1;
3060
3061                         frame_num = pic_num_extract(h, pred, &pic_structure);
3062
3063                         for(i= h->short_ref_count-1; i>=0; i--){
3064                             ref = h->short_ref[i];
3065                             assert(ref->reference);
3066                             assert(!ref->long_ref);
3067                             if(ref->data[0] != NULL &&
3068                                    ref->frame_num == frame_num &&
3069                                    (ref->reference & pic_structure) &&
3070                                    ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3071                                 break;
3072                         }
3073                         if(i>=0)
3074                             ref->pic_id= pred;
3075                     }else{
3076                         int long_idx;
3077                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3078
3079                         long_idx= pic_num_extract(h, pic_id, &pic_structure);
3080
3081                         if(long_idx>31){
3082                             av_log(h->s.avctx, AV_LOG_ERROR, "long_term_pic_idx overflow\n");
3083                             return -1;
3084                         }
3085                         ref = h->long_ref[long_idx];
3086                         assert(!(ref && !ref->reference));
3087                         if(ref && (ref->reference & pic_structure)){
3088                             ref->pic_id= pic_id;
3089                             assert(ref->long_ref);
3090                             i=0;
3091                         }else{
3092                             i=-1;
3093                         }
3094                     }
3095
3096                     if (i < 0) {
3097                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3098                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3099                     } else {
3100                         for(i=index; i+1<h->ref_count[list]; i++){
3101                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3102                                 break;
3103                         }
3104                         for(; i > index; i--){
3105                             h->ref_list[list][i]= h->ref_list[list][i-1];
3106                         }
3107                         h->ref_list[list][index]= *ref;
3108                         if (FIELD_PICTURE){
3109                             pic_as_field(&h->ref_list[list][index], pic_structure);
3110                         }
3111                     }
3112                 }else{
3113                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3114                     return -1;
3115                 }
3116             }
3117         }
3118     }
3119     for(list=0; list<h->list_count; list++){
3120         for(index= 0; index < h->ref_count[list]; index++){
3121             if(!h->ref_list[list][index].data[0])
3122                 h->ref_list[list][index]= s->current_picture;
3123         }
3124     }
3125
3126     if(h->slice_type==FF_B_TYPE && !h->direct_spatial_mv_pred)
3127         direct_dist_scale_factor(h);
3128     direct_ref_list_init(h);
3129     return 0;
3130 }
3131
3132 static void fill_mbaff_ref_list(H264Context *h){
3133     int list, i, j;
3134     for(list=0; list<2; list++){ //FIXME try list_count
3135         for(i=0; i<h->ref_count[list]; i++){
3136             Picture *frame = &h->ref_list[list][i];
3137             Picture *field = &h->ref_list[list][16+2*i];
3138             field[0] = *frame;
3139             for(j=0; j<3; j++)
3140                 field[0].linesize[j] <<= 1;
3141             field[0].reference = PICT_TOP_FIELD;
3142             field[1] = field[0];
3143             for(j=0; j<3; j++)
3144                 field[1].data[j] += frame->linesize[j];
3145             field[1].reference = PICT_BOTTOM_FIELD;
3146
3147             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
3148             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
3149             for(j=0; j<2; j++){
3150                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
3151                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
3152             }
3153         }
3154     }
3155     for(j=0; j<h->ref_count[1]; j++){
3156         for(i=0; i<h->ref_count[0]; i++)
3157             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
3158         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
3159         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
3160     }
3161 }
3162
3163 static int pred_weight_table(H264Context *h){
3164     MpegEncContext * const s = &h->s;
3165     int list, i;
3166     int luma_def, chroma_def;
3167
3168     h->use_weight= 0;
3169     h->use_weight_chroma= 0;
3170     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3171     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3172     luma_def = 1<<h->luma_log2_weight_denom;
3173     chroma_def = 1<<h->chroma_log2_weight_denom;
3174
3175     for(list=0; list<2; list++){
3176         for(i=0; i<h->ref_count[list]; i++){
3177             int luma_weight_flag, chroma_weight_flag;
3178
3179             luma_weight_flag= get_bits1(&s->gb);
3180             if(luma_weight_flag){
3181                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3182                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3183                 if(   h->luma_weight[list][i] != luma_def
3184                    || h->luma_offset[list][i] != 0)
3185                     h->use_weight= 1;
3186             }else{
3187                 h->luma_weight[list][i]= luma_def;
3188                 h->luma_offset[list][i]= 0;
3189             }
3190
3191             chroma_weight_flag= get_bits1(&s->gb);
3192             if(chroma_weight_flag){
3193                 int j;
3194                 for(j=0; j<2; j++){
3195                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3196                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3197                     if(   h->chroma_weight[list][i][j] != chroma_def
3198                        || h->chroma_offset[list][i][j] != 0)
3199                         h->use_weight_chroma= 1;
3200                 }
3201             }else{
3202                 int j;
3203                 for(j=0; j<2; j++){
3204                     h->chroma_weight[list][i][j]= chroma_def;
3205                     h->chroma_offset[list][i][j]= 0;
3206                 }
3207             }
3208         }
3209         if(h->slice_type != FF_B_TYPE) break;
3210     }
3211     h->use_weight= h->use_weight || h->use_weight_chroma;
3212     return 0;
3213 }
3214
3215 static void implicit_weight_table(H264Context *h){
3216     MpegEncContext * const s = &h->s;
3217     int ref0, ref1;
3218     int cur_poc = s->current_picture_ptr->poc;
3219
3220     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3221        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3222         h->use_weight= 0;
3223         h->use_weight_chroma= 0;
3224         return;
3225     }
3226
3227     h->use_weight= 2;
3228     h->use_weight_chroma= 2;
3229     h->luma_log2_weight_denom= 5;
3230     h->chroma_log2_weight_denom= 5;
3231
3232     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3233         int poc0 = h->ref_list[0][ref0].poc;
3234         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3235             int poc1 = h->ref_list[1][ref1].poc;
3236             int td = av_clip(poc1 - poc0, -128, 127);
3237             if(td){
3238                 int tb = av_clip(cur_poc - poc0, -128, 127);
3239                 int tx = (16384 + (FFABS(td) >> 1)) / td;
3240                 int dist_scale_factor = av_clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3241                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3242                     h->implicit_weight[ref0][ref1] = 32;
3243                 else
3244                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3245             }else
3246                 h->implicit_weight[ref0][ref1] = 32;
3247         }
3248     }
3249 }
3250
3251 /**
3252  * Mark a picture as no longer needed for reference. The refmask
3253  * argument allows unreferencing of individual fields or the whole frame.
3254  * If the picture becomes entirely unreferenced, but is being held for
3255  * display purposes, it is marked as such.
3256  * @param refmask mask of fields to unreference; the mask is bitwise
3257  *                anded with the reference marking of pic
3258  * @return non-zero if pic becomes entirely unreferenced (except possibly
3259  *         for display purposes) zero if one of the fields remains in
3260  *         reference
3261  */
3262 static inline int unreference_pic(H264Context *h, Picture *pic, int refmask){
3263     int i;
3264     if (pic->reference &= refmask) {
3265         return 0;
3266     } else {
3267         if(pic == h->delayed_output_pic)
3268             pic->reference=DELAYED_PIC_REF;
3269         else{
3270             for(i = 0; h->delayed_pic[i]; i++)
3271                 if(pic == h->delayed_pic[i]){
3272                     pic->reference=DELAYED_PIC_REF;
3273                     break;
3274                 }
3275         }
3276         return 1;
3277     }
3278 }
3279
3280 /**
3281  * instantaneous decoder refresh.
3282  */
3283 static void idr(H264Context *h){
3284     int i;
3285
3286     for(i=0; i<16; i++){
3287         if (h->long_ref[i] != NULL) {
3288             unreference_pic(h, h->long_ref[i], 0);
3289             h->long_ref[i]= NULL;
3290         }
3291     }
3292     h->long_ref_count=0;
3293
3294     for(i=0; i<h->short_ref_count; i++){
3295         unreference_pic(h, h->short_ref[i], 0);
3296         h->short_ref[i]= NULL;
3297     }
3298     h->short_ref_count=0;
3299 }
3300
3301 /* forget old pics after a seek */
3302 static void flush_dpb(AVCodecContext *avctx){
3303     H264Context *h= avctx->priv_data;
3304     int i;
3305     for(i=0; i<16; i++) {
3306         if(h->delayed_pic[i])
3307             h->delayed_pic[i]->reference= 0;
3308         h->delayed_pic[i]= NULL;
3309     }
3310     if(h->delayed_output_pic)
3311         h->delayed_output_pic->reference= 0;
3312     h->delayed_output_pic= NULL;
3313     idr(h);
3314     if(h->s.current_picture_ptr)
3315         h->s.current_picture_ptr->reference= 0;
3316     h->s.first_field= 0;
3317     ff_mpeg_flush(avctx);
3318 }
3319
3320 /**
3321  * Find a Picture in the short term reference list by frame number.
3322  * @param frame_num frame number to search for
3323  * @param idx the index into h->short_ref where returned picture is found
3324  *            undefined if no picture found.
3325  * @return pointer to the found picture, or NULL if no pic with the provided
3326  *                 frame number is found
3327  */
3328 static Picture * find_short(H264Context *h, int frame_num, int *idx){
3329     MpegEncContext * const s = &h->s;
3330     int i;
3331
3332     for(i=0; i<h->short_ref_count; i++){
3333         Picture *pic= h->short_ref[i];
3334         if(s->avctx->debug&FF_DEBUG_MMCO)
3335             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3336         if(pic->frame_num == frame_num) {
3337             *idx = i;
3338             return pic;
3339         }
3340     }
3341     return NULL;
3342 }
3343
3344 /**
3345  * Remove a picture from the short term reference list by its index in
3346  * that list.  This does no checking on the provided index; it is assumed
3347  * to be valid. Other list entries are shifted down.
3348  * @param i index into h->short_ref of picture to remove.
3349  */
3350 static void remove_short_at_index(H264Context *h, int i){
3351     assert(i > 0 && i < h->short_ref_count);
3352     h->short_ref[i]= NULL;
3353     if (--h->short_ref_count)
3354         memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i)*sizeof(Picture*));
3355 }
3356
3357 /**
3358  *
3359  * @return the removed picture or NULL if an error occurs
3360  */
3361 static Picture * remove_short(H264Context *h, int frame_num){
3362     MpegEncContext * const s = &h->s;
3363     Picture *pic;
3364     int i;
3365
3366     if(s->avctx->debug&FF_DEBUG_MMCO)
3367         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3368
3369     pic = find_short(h, frame_num, &i);
3370     if (pic)
3371         remove_short_at_index(h, i);
3372
3373     return pic;
3374 }
3375
3376 /**
3377  * Remove a picture from the long term reference list by its index in
3378  * that list.  This does no checking on the provided index; it is assumed
3379  * to be valid. The removed entry is set to NULL. Other entries are unaffected.
3380  * @param i index into h->long_ref of picture to remove.
3381  */
3382 static void remove_long_at_index(H264Context *h, int i){
3383     h->long_ref[i]= NULL;
3384     h->long_ref_count--;
3385 }
3386
3387 /**
3388  *
3389  * @return the removed picture or NULL if an error occurs
3390  */
3391 static Picture * remove_long(H264Context *h, int i){
3392     Picture *pic;
3393
3394     pic= h->long_ref[i];
3395     if (pic)
3396         remove_long_at_index(h, i);
3397
3398     return pic;
3399 }
3400
3401 /**
3402  * print short term list
3403  */
3404 static void print_short_term(H264Context *h) {
3405     uint32_t i;
3406     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3407         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3408         for(i=0; i<h->short_ref_count; i++){
3409             Picture *pic= h->short_ref[i];
3410             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3411         }
3412     }
3413 }
3414
3415 /**
3416  * print long term list
3417  */
3418 static void print_long_term(H264Context *h) {
3419     uint32_t i;
3420     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3421         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3422         for(i = 0; i < 16; i++){
3423             Picture *pic= h->long_ref[i];
3424             if (pic) {
3425                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3426             }
3427         }
3428     }
3429 }
3430
3431 /**
3432  * Executes the reference picture marking (memory management control operations).
3433  */
3434 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3435     MpegEncContext * const s = &h->s;
3436     int i, j;
3437     int current_ref_assigned=0;
3438     Picture *pic;
3439
3440     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3441         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3442
3443     for(i=0; i<mmco_count; i++){
3444         int structure, frame_num, unref_pic;
3445         if(s->avctx->debug&FF_DEBUG_MMCO)
3446             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_pic_num, h->mmco[i].long_arg);
3447
3448         switch(mmco[i].opcode){
3449         case MMCO_SHORT2UNUSED:
3450             if(s->avctx->debug&FF_DEBUG_MMCO)
3451                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short %d count %d\n", h->mmco[i].short_pic_num, h->short_ref_count);
3452             frame_num = pic_num_extract(h, mmco[i].short_pic_num, &structure);
3453             pic = find_short(h, frame_num, &j);
3454             if (pic) {
3455                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3456                     remove_short_at_index(h, j);
3457             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3458                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref short failure\n");
3459             break;
3460         case MMCO_SHORT2LONG:
3461             if (FIELD_PICTURE && mmco[i].long_arg < h->long_ref_count &&
3462                     h->long_ref[mmco[i].long_arg]->frame_num ==
3463                                               mmco[i].short_pic_num / 2) {
3464                 /* do nothing, we've already moved this field pair. */
3465             } else {
3466                 int frame_num = mmco[i].short_pic_num >> FIELD_PICTURE;
3467
3468                 pic= remove_long(h, mmco[i].long_arg);
3469                 if(pic) unreference_pic(h, pic, 0);
3470
3471                 h->long_ref[ mmco[i].long_arg ]= remove_short(h, frame_num);
3472                 if (h->long_ref[ mmco[i].long_arg ]){
3473                     h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3474                     h->long_ref_count++;
3475                 }
3476             }
3477             break;
3478         case MMCO_LONG2UNUSED:
3479             j = pic_num_extract(h, mmco[i].long_arg, &structure);
3480             pic = h->long_ref[j];
3481             if (pic) {
3482                 if (unreference_pic(h, pic, structure ^ PICT_FRAME))
3483                     remove_long_at_index(h, j);
3484             } else if(s->avctx->debug&FF_DEBUG_MMCO)
3485                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: unref long failure\n");
3486             break;
3487         case MMCO_LONG:
3488             unref_pic = 1;
3489             if (FIELD_PICTURE && !s->first_field) {
3490                 if (h->long_ref[mmco[i].long_arg] == s->current_picture_ptr) {
3491                     /* Just mark second field as referenced */
3492                     unref_pic = 0;
3493                 } else if (s->current_picture_ptr->reference) {
3494                     /* First field in pair is in short term list or
3495                      * at a different long term index.
3496                      * This is not allowed; see 7.4.3, notes 2 and 3.
3497                      * Report the problem and keep the pair where it is,
3498                      * and mark this field valid.
3499                      */
3500                     av_log(h->s.avctx, AV_LOG_ERROR,
3501                         "illegal long term reference assignment for second "
3502                         "field in complementary field pair (first field is "
3503                         "short term or has non-matching long index)\n");
3504                     unref_pic = 0;
3505                 }
3506             }
3507
3508             if (unref_pic) {
3509                 pic= remove_long(h, mmco[i].long_arg);
3510                 if(pic) unreference_pic(h, pic, 0);
3511
3512                 h->long_ref[ mmco[i].long_arg ]= s->current_picture_ptr;
3513                 h->long_ref[ mmco[i].long_arg ]->long_ref=1;
3514                 h->long_ref_count++;
3515             }
3516
3517             s->current_picture_ptr->reference |= s->picture_structure;
3518             current_ref_assigned=1;
3519             break;
3520         case MMCO_SET_MAX_LONG:
3521             assert(mmco[i].long_arg <= 16);
3522             // just remove the long term which index is greater than new max
3523             for(j = mmco[i].long_arg; j<16; j++){
3524                 pic = remove_long(h, j);
3525                 if (pic) unreference_pic(h, pic, 0);
3526             }
3527             break;
3528         case MMCO_RESET:
3529             while(h->short_ref_count){
3530                 pic= remove_short(h, h->short_ref[0]->frame_num);
3531                 if(pic) unreference_pic(h, pic, 0);
3532             }
3533             for(j = 0; j < 16; j++) {
3534                 pic= remove_long(h, j);
3535                 if(pic) unreference_pic(h, pic, 0);
3536             }
3537             break;
3538         default: assert(0);
3539         }
3540     }
3541
3542     if (!current_ref_assigned && FIELD_PICTURE &&
3543             !s->first_field && s->current_picture_ptr->reference) {
3544
3545         /* Second field of complementary field pair; the first field of
3546          * which is already referenced. If short referenced, it
3547          * should be first entry in short_ref. If not, it must exist
3548          * in long_ref; trying to put it on the short list here is an
3549          * error in the encoded bit stream (ref: 7.4.3, NOTE 2 and 3).
3550          */
3551         if (h->short_ref_count && h->short_ref[0] == s->current_picture_ptr) {
3552             /* Just mark the second field valid */
3553             s->current_picture_ptr->reference = PICT_FRAME;
3554         } else if (s->current_picture_ptr->long_ref) {
3555             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term reference "
3556                                              "assignment for second field "
3557                                              "in complementary field pair "
3558                                              "(first field is long term)\n");
3559         } else {
3560             /*
3561              * First field in reference, but not in any sensible place on our
3562              * reference lists. This shouldn't happen unless reference
3563              * handling somewhere else is wrong.
3564              */
3565             assert(0);
3566         }
3567         current_ref_assigned = 1;
3568     }
3569
3570     if(!current_ref_assigned){
3571         pic= remove_short(h, s->current_picture_ptr->frame_num);
3572         if(pic){
3573             unreference_pic(h, pic, 0);
3574             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3575         }
3576
3577         if(h->short_ref_count)
3578             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3579
3580         h->short_ref[0]= s->current_picture_ptr;
3581         h->short_ref[0]->long_ref=0;
3582         h->short_ref_count++;
3583         s->current_picture_ptr->reference |= s->picture_structure;
3584     }
3585
3586     if (h->long_ref_count + h->short_ref_count > h->sps.ref_frame_count){
3587
3588         /* We have too many reference frames, probably due to corrupted
3589          * stream. Need to discard one frame. Prevents overrun of the
3590          * short_ref and long_ref buffers.
3591          */
3592         av_log(h->s.avctx, AV_LOG_ERROR,
3593                "number of reference frames exceeds max (probably "
3594                "corrupt input), discarding one\n");
3595
3596         if (h->long_ref_count) {
3597             for (i = 0; i < 16; ++i)
3598                 if (h->long_ref[i])
3599                     break;
3600
3601             assert(i < 16);
3602             pic = h->long_ref[i];
3603             remove_long_at_index(h, i);
3604         } else {
3605             pic = h->short_ref[h->short_ref_count - 1];
3606             remove_short_at_index(h, h->short_ref_count - 1);
3607         }
3608         unreference_pic(h, pic, 0);
3609     }
3610
3611     print_short_term(h);
3612     print_long_term(h);
3613     return 0;
3614 }
3615
3616 static int decode_ref_pic_marking(H264Context *h, GetBitContext *gb){
3617     MpegEncContext * const s = &h->s;
3618     int i;
3619
3620     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3621         s->broken_link= get_bits1(gb) -1;
3622         h->mmco[0].long_arg= get_bits1(gb) - 1; // current_long_term_idx
3623         if(h->mmco[0].long_arg == -1)
3624             h->mmco_index= 0;
3625         else{
3626             h->mmco[0].opcode= MMCO_LONG;
3627             h->mmco_index= 1;
3628         }
3629     }else{
3630         if(get_bits1(gb)){ // adaptive_ref_pic_marking_mode_flag
3631             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3632                 MMCOOpcode opcode= get_ue_golomb(gb);
3633
3634                 h->mmco[i].opcode= opcode;
3635                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3636                     h->mmco[i].short_pic_num= (h->curr_pic_num - get_ue_golomb(gb) - 1) & (h->max_pic_num - 1);
3637 /*                    if(h->mmco[i].short_pic_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_pic_num ] == NULL){
3638                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
3639                         return -1;
3640                     }*/
3641                 }
3642                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3643                     unsigned int long_arg= get_ue_golomb(gb);
3644                     if(long_arg >= 32 || (long_arg >= 16 && !(opcode == MMCO_LONG2UNUSED && FIELD_PICTURE))){
3645                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3646                         return -1;
3647                     }
3648                     h->mmco[i].long_arg= long_arg;
3649                 }
3650
3651                 if(opcode > (unsigned)MMCO_LONG){
3652                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3653                     return -1;
3654                 }
3655                 if(opcode == MMCO_END)
3656                     break;
3657             }
3658             h->mmco_index= i;
3659         }else{
3660             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3661
3662             if(h->short_ref_count && h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count &&
3663                     !(FIELD_PICTURE && !s->first_field && s->current_picture_ptr->reference)) {
3664                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3665                 h->mmco[0].short_pic_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3666                 h->mmco_index= 1;
3667                 if (FIELD_PICTURE) {
3668                     h->mmco[0].short_pic_num *= 2;
3669                     h->mmco[1].opcode= MMCO_SHORT2UNUSED;
3670                     h->mmco[1].short_pic_num= h->mmco[0].short_pic_num + 1;
3671                     h->mmco_index= 2;
3672                 }
3673             }else
3674                 h->mmco_index= 0;
3675         }
3676     }
3677
3678     return 0;
3679 }
3680
3681 static int init_poc(H264Context *h){
3682     MpegEncContext * const s = &h->s;
3683     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3684     int field_poc[2];
3685
3686     if(h->nal_unit_type == NAL_IDR_SLICE){
3687         h->frame_num_offset= 0;
3688     }else{
3689         if(h->frame_num < h->prev_frame_num)
3690             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3691         else
3692             h->frame_num_offset= h->prev_frame_num_offset;
3693     }
3694
3695     if(h->sps.poc_type==0){
3696         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3697
3698         if(h->nal_unit_type == NAL_IDR_SLICE){
3699              h->prev_poc_msb=
3700              h->prev_poc_lsb= 0;
3701         }
3702
3703         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3704             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3705         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3706             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3707         else
3708             h->poc_msb = h->prev_poc_msb;
3709 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3710         field_poc[0] =
3711         field_poc[1] = h->poc_msb + h->poc_lsb;
3712         if(s->picture_structure == PICT_FRAME)
3713             field_poc[1] += h->delta_poc_bottom;
3714     }else if(h->sps.poc_type==1){
3715         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3716         int i;
3717
3718         if(h->sps.poc_cycle_length != 0)
3719             abs_frame_num = h->frame_num_offset + h->frame_num;
3720         else
3721             abs_frame_num = 0;
3722
3723         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3724             abs_frame_num--;
3725
3726         expected_delta_per_poc_cycle = 0;
3727         for(i=0; i < h->sps.poc_cycle_length; i++)
3728             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3729
3730         if(abs_frame_num > 0){
3731             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3732             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3733
3734             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3735             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3736                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3737         } else
3738             expectedpoc = 0;
3739
3740         if(h->nal_ref_idc == 0)
3741             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3742
3743         field_poc[0] = expectedpoc + h->delta_poc[0];
3744         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3745
3746         if(s->picture_structure == PICT_FRAME)
3747             field_poc[1] += h->delta_poc[1];
3748     }else{
3749         int poc;
3750         if(h->nal_unit_type == NAL_IDR_SLICE){
3751             poc= 0;
3752         }else{
3753             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3754             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3755         }
3756         field_poc[0]= poc;
3757         field_poc[1]= poc;
3758     }
3759
3760     if(s->picture_structure != PICT_BOTTOM_FIELD) {
3761         s->current_picture_ptr->field_poc[0]= field_poc[0];
3762         s->current_picture_ptr->poc = field_poc[0];
3763     }
3764     if(s->picture_structure != PICT_TOP_FIELD) {
3765         s->current_picture_ptr->field_poc[1]= field_poc[1];
3766         s->current_picture_ptr->poc = field_poc[1];
3767     }
3768     if(!FIELD_PICTURE || !s->first_field) {
3769         Picture *cur = s->current_picture_ptr;
3770         cur->poc= FFMIN(cur->field_poc[0], cur->field_poc[1]);
3771     }
3772
3773     return 0;
3774 }
3775
3776
3777 /**
3778  * initialize scan tables
3779  */
3780 static void init_scan_tables(H264Context *h){
3781     MpegEncContext * const s = &h->s;
3782     int i;
3783     if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
3784         memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
3785         memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
3786     }else{
3787         for(i=0; i<16; i++){
3788 #define T(x) (x>>2) | ((x<<2) & 0xF)
3789             h->zigzag_scan[i] = T(zigzag_scan[i]);
3790             h-> field_scan[i] = T( field_scan[i]);
3791 #undef T
3792         }
3793     }
3794     if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
3795         memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
3796         memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
3797         memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
3798         memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
3799     }else{
3800         for(i=0; i<64; i++){
3801 #define T(x) (x>>3) | ((x&7)<<3)
3802             h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
3803             h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
3804             h->field_scan8x8[i]        = T(field_scan8x8[i]);
3805             h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
3806 #undef T
3807         }
3808     }
3809     if(h->sps.transform_bypass){ //FIXME same ugly
3810         h->zigzag_scan_q0          = zigzag_scan;
3811         h->zigzag_scan8x8_q0       = zigzag_scan8x8;
3812         h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
3813         h->field_scan_q0           = field_scan;
3814         h->field_scan8x8_q0        = field_scan8x8;
3815         h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
3816     }else{
3817         h->zigzag_scan_q0          = h->zigzag_scan;
3818         h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
3819         h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
3820         h->field_scan_q0           = h->field_scan;
3821         h->field_scan8x8_q0        = h->field_scan8x8;
3822         h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
3823     }
3824 }
3825
3826 /**
3827  * Replicates H264 "master" context to thread contexts.
3828  */
3829 static void clone_slice(H264Context *dst, H264Context *src)
3830 {
3831     memcpy(dst->block_offset,     src->block_offset, sizeof(dst->block_offset));
3832     dst->s.current_picture_ptr  = src->s.current_picture_ptr;
3833     dst->s.current_picture      = src->s.current_picture;
3834     dst->s.linesize             = src->s.linesize;
3835     dst->s.uvlinesize           = src->s.uvlinesize;
3836     dst->s.first_field          = src->s.first_field;
3837
3838     dst->prev_poc_msb           = src->prev_poc_msb;
3839     dst->prev_poc_lsb           = src->prev_poc_lsb;
3840     dst->prev_frame_num_offset  = src->prev_frame_num_offset;
3841     dst->prev_frame_num         = src->prev_frame_num;
3842     dst->short_ref_count        = src->short_ref_count;
3843
3844     memcpy(dst->short_ref,        src->short_ref,        sizeof(dst->short_ref));
3845     memcpy(dst->long_ref,         src->long_ref,         sizeof(dst->long_ref));
3846     memcpy(dst->default_ref_list, src->default_ref_list, sizeof(dst->default_ref_list));
3847     memcpy(dst->ref_list,         src->ref_list,         sizeof(dst->ref_list));
3848
3849     memcpy(dst->dequant4_coeff,   src->dequant4_coeff,   sizeof(src->dequant4_coeff));
3850     memcpy(dst->dequant8_coeff,   src->dequant8_coeff,   sizeof(src->dequant8_coeff));
3851 }
3852
3853 /**
3854  * decodes a slice header.
3855  * this will allso call MPV_common_init() and frame_start() as needed
3856  *
3857  * @param h h264context
3858  * @param h0 h264 master context (differs from 'h' when doing sliced based parallel decoding)
3859  *
3860  * @return 0 if okay, <0 if an error occured, 1 if decoding must not be multithreaded
3861  */
3862 static int decode_slice_header(H264Context *h, H264Context *h0){
3863     MpegEncContext * const s = &h->s;
3864     MpegEncContext * const s0 = &h0->s;
3865     unsigned int first_mb_in_slice;
3866     unsigned int pps_id;
3867     int num_ref_idx_active_override_flag;
3868     static const uint8_t slice_type_map[5]= {FF_P_TYPE, FF_B_TYPE, FF_I_TYPE, FF_SP_TYPE, FF_SI_TYPE};
3869     unsigned int slice_type, tmp, i;
3870     int default_ref_list_done = 0;
3871     int last_pic_structure;
3872
3873     s->dropable= h->nal_ref_idc == 0;
3874
3875     if((s->avctx->flags2 & CODEC_FLAG2_FAST) && !h->nal_ref_idc){
3876         s->me.qpel_put= s->dsp.put_2tap_qpel_pixels_tab;
3877         s->me.qpel_avg= s->dsp.avg_2tap_qpel_pixels_tab;
3878     }else{
3879         s->me.qpel_put= s->dsp.put_h264_qpel_pixels_tab;
3880         s->me.qpel_avg= s->dsp.avg_h264_qpel_pixels_tab;
3881     }
3882
3883     first_mb_in_slice= get_ue_golomb(&s->gb);
3884
3885     if((s->flags2 & CODEC_FLAG2_CHUNKS) && first_mb_in_slice == 0){
3886         h0->current_slice = 0;
3887         if (!s0->first_field)
3888             s->current_picture_ptr= NULL;
3889     }
3890
3891     slice_type= get_ue_golomb(&s->gb);
3892     if(slice_type > 9){
3893         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3894         return -1;
3895     }
3896     if(slice_type > 4){
3897         slice_type -= 5;
3898         h->slice_type_fixed=1;
3899     }else
3900         h->slice_type_fixed=0;
3901
3902     slice_type= slice_type_map[ slice_type ];
3903     if (slice_type == FF_I_TYPE
3904         || (h0->current_slice != 0 && slice_type == h0->last_slice_type) ) {
3905         default_ref_list_done = 1;
3906     }
3907     h->slice_type= slice_type;
3908
3909     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3910     if (s->pict_type == FF_B_TYPE && s0->last_picture_ptr == NULL) {
3911         av_log(h->s.avctx, AV_LOG_ERROR,
3912                "B picture before any references, skipping\n");
3913         return -1;
3914     }
3915
3916     pps_id= get_ue_golomb(&s->gb);
3917     if(pps_id>=MAX_PPS_COUNT){
3918         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3919         return -1;
3920     }
3921     if(!h0->pps_buffers[pps_id]) {
3922         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3923         return -1;
3924     }
3925     h->pps= *h0->pps_buffers[pps_id];
3926
3927     if(!h0->sps_buffers[h->pps.sps_id]) {
3928         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3929         return -1;
3930     }
3931     h->sps = *h0->sps_buffers[h->pps.sps_id];
3932
3933     if(h == h0 && h->dequant_coeff_pps != pps_id){
3934         h->dequant_coeff_pps = pps_id;
3935         init_dequant_tables(h);
3936     }
3937
3938     s->mb_width= h->sps.mb_width;
3939     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3940
3941     h->b_stride=  s->mb_width*4;
3942     h->b8_stride= s->mb_width*2;
3943
3944     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3945     if(h->sps.frame_mbs_only_flag)
3946         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3947     else
3948         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3949
3950     if (s->context_initialized
3951         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3952         if(h != h0)
3953             return -1;   // width / height changed during parallelized decoding
3954         free_tables(h);
3955         MPV_common_end(s);
3956     }
3957     if (!s->context_initialized) {
3958         if(h != h0)
3959             return -1;  // we cant (re-)initialize context during parallel decoding
3960         if (MPV_common_init(s) < 0)
3961             return -1;
3962         s->first_field = 0;
3963
3964         init_scan_tables(h);
3965         alloc_tables(h);
3966
3967         for(i = 1; i < s->avctx->thread_count; i++) {
3968             H264Context *c;
3969             c = h->thread_context[i] = av_malloc(sizeof(H264Context));
3970             memcpy(c, h->s.thread_context[i], sizeof(MpegEncContext));
3971             memset(&c->s + 1, 0, sizeof(H264Context) - sizeof(MpegEncContext));
3972             c->sps = h->sps;
3973             c->pps = h->pps;
3974             init_scan_tables(c);
3975             clone_tables(c, h);
3976         }
3977
3978         for(i = 0; i < s->avctx->thread_count; i++)
3979             if(context_init(h->thread_context[i]) < 0)
3980                 return -1;
3981
3982         s->avctx->width = s->width;
3983         s->avctx->height = s->height;
3984         s->avctx->sample_aspect_ratio= h->sps.sar;
3985         if(!s->avctx->sample_aspect_ratio.den)
3986             s->avctx->sample_aspect_ratio.den = 1;
3987
3988         if(h->sps.timing_info_present_flag){
3989             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
3990             if(h->x264_build > 0 && h->x264_build < 44)
3991                 s->avctx->time_base.den *= 2;
3992             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
3993                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
3994         }
3995     }
3996
3997     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3998
3999     h->mb_mbaff = 0;
4000     h->mb_aff_frame = 0;
4001     last_pic_structure = s0->picture_structure;
4002     if(h->sps.frame_mbs_only_flag){
4003         s->picture_structure= PICT_FRAME;
4004     }else{
4005         if(get_bits1(&s->gb)) { //field_pic_flag
4006             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4007         } else {
4008             s->picture_structure= PICT_FRAME;
4009             h->mb_aff_frame = h->sps.mb_aff;
4010         }
4011     }
4012
4013     if(h0->current_slice == 0){
4014         /* See if we have a decoded first field looking for a pair... */
4015         if (s0->first_field) {
4016             assert(s0->current_picture_ptr);
4017             assert(s0->current_picture_ptr->data[0]);
4018             assert(s0->current_picture_ptr->reference != DELAYED_PIC_REF);
4019
4020             /* figure out if we have a complementary field pair */
4021             if (!FIELD_PICTURE || s->picture_structure == last_pic_structure) {
4022                 /*
4023                  * Previous field is unmatched. Don't display it, but let it
4024                  * remain for reference if marked as such.
4025                  */
4026                 s0->current_picture_ptr = NULL;
4027                 s0->first_field = FIELD_PICTURE;
4028
4029             } else {
4030                 if (h->nal_ref_idc &&
4031                         s0->current_picture_ptr->reference &&
4032                         s0->current_picture_ptr->frame_num != h->frame_num) {
4033                     /*
4034                      * This and previous field were reference, but had
4035                      * different frame_nums. Consider this field first in
4036                      * pair. Throw away previous field except for reference
4037                      * purposes.
4038                      */
4039                     s0->first_field = 1;
4040                     s0->current_picture_ptr = NULL;
4041
4042                 } else {
4043                     /* Second field in complementary pair */
4044                     s0->first_field = 0;
4045                 }
4046             }
4047
4048         } else {
4049             /* Frame or first field in a potentially complementary pair */
4050             assert(!s0->current_picture_ptr);
4051             s0->first_field = FIELD_PICTURE;
4052         }
4053
4054         if((!FIELD_PICTURE || s0->first_field) && frame_start(h) < 0) {
4055             s0->first_field = 0;
4056             return -1;
4057         }
4058     }
4059     if(h != h0)
4060         clone_slice(h, h0);
4061
4062     s->current_picture_ptr->frame_num= h->frame_num; //FIXME frame_num cleanup
4063
4064     assert(s->mb_num == s->mb_width * s->mb_height);
4065     if(first_mb_in_slice << FIELD_OR_MBAFF_PICTURE >= s->mb_num ||
4066        first_mb_in_slice                    >= s->mb_num){
4067         av_log(h->s.avctx, AV_LOG_ERROR, "first_mb_in_slice overflow\n");
4068         return -1;
4069     }
4070     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4071     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << FIELD_OR_MBAFF_PICTURE;
4072     if (s->picture_structure == PICT_BOTTOM_FIELD)
4073         s->resync_mb_y = s->mb_y = s->mb_y + 1;
4074     assert(s->mb_y < s->mb_height);
4075
4076     if(s->picture_structure==PICT_FRAME){
4077         h->curr_pic_num=   h->frame_num;
4078         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4079     }else{
4080         h->curr_pic_num= 2*h->frame_num + 1;
4081         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4082     }
4083
4084     if(h->nal_unit_type == NAL_IDR_SLICE){
4085         get_ue_golomb(&s->gb); /* idr_pic_id */
4086     }
4087
4088     if(h->sps.poc_type==0){
4089         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4090
4091         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4092             h->delta_poc_bottom= get_se_golomb(&s->gb);
4093         }
4094     }
4095
4096     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4097         h->delta_poc[0]= get_se_golomb(&s->gb);
4098
4099         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4100             h->delta_poc[1]= get_se_golomb(&s->gb);
4101     }
4102
4103     init_poc(h);
4104
4105     if(h->pps.redundant_pic_cnt_present){
4106         h->redundant_pic_count= get_ue_golomb(&s->gb);
4107     }
4108
4109     //set defaults, might be overriden a few line later
4110     h->ref_count[0]= h->pps.ref_count[0];
4111     h->ref_count[1]= h->pps.ref_count[1];
4112
4113     if(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE || h->slice_type == FF_B_TYPE){
4114         if(h->slice_type == FF_B_TYPE){
4115             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4116         }
4117         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4118
4119         if(num_ref_idx_active_override_flag){
4120             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4121             if(h->slice_type==FF_B_TYPE)
4122                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4123
4124             if(h->ref_count[0]-1 > 32-1 || h->ref_count[1]-1 > 32-1){
4125                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4126                 h->ref_count[0]= h->ref_count[1]= 1;
4127                 return -1;
4128             }
4129         }
4130         if(h->slice_type == FF_B_TYPE)
4131             h->list_count= 2;
4132         else
4133             h->list_count= 1;
4134     }else
4135         h->list_count= 0;
4136
4137     if(!default_ref_list_done){
4138         fill_default_ref_list(h);
4139     }
4140
4141     if(decode_ref_pic_list_reordering(h) < 0)
4142         return -1;
4143
4144     if(   (h->pps.weighted_pred          && (h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE ))
4145        || (h->pps.weighted_bipred_idc==1 && h->slice_type==FF_B_TYPE ) )
4146         pred_weight_table(h);
4147     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==FF_B_TYPE)
4148         implicit_weight_table(h);
4149     else
4150         h->use_weight = 0;
4151
4152     if(h->nal_ref_idc)
4153         decode_ref_pic_marking(h0, &s->gb);
4154
4155     if(FRAME_MBAFF)
4156         fill_mbaff_ref_list(h);
4157
4158     if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE && h->pps.cabac ){
4159         tmp = get_ue_golomb(&s->gb);
4160         if(tmp > 2){
4161             av_log(s->avctx, AV_LOG_ERROR, "cabac_init_idc overflow\n");
4162             return -1;
4163         }
4164         h->cabac_init_idc= tmp;
4165     }
4166
4167     h->last_qscale_diff = 0;
4168     tmp = h->pps.init_qp + get_se_golomb(&s->gb);
4169     if(tmp>51){
4170         av_log(s->avctx, AV_LOG_ERROR, "QP %u out of range\n", tmp);
4171         return -1;
4172     }
4173     s->qscale= tmp;
4174     h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
4175     h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
4176     //FIXME qscale / qp ... stuff
4177     if(h->slice_type == FF_SP_TYPE){
4178         get_bits1(&s->gb); /* sp_for_switch_flag */
4179     }
4180     if(h->slice_type==FF_SP_TYPE || h->slice_type == FF_SI_TYPE){
4181         get_se_golomb(&s->gb); /* slice_qs_delta */
4182     }
4183
4184     h->deblocking_filter = 1;
4185     h->slice_alpha_c0_offset = 0;
4186     h->slice_beta_offset = 0;
4187     if( h->pps.deblocking_filter_parameters_present ) {
4188         tmp= get_ue_golomb(&s->gb);
4189         if(tmp > 2){
4190             av_log(s->avctx, AV_LOG_ERROR, "deblocking_filter_idc %u out of range\n", tmp);
4191             return -1;
4192         }
4193         h->deblocking_filter= tmp;
4194         if(h->deblocking_filter < 2)
4195             h->deblocking_filter^= 1; // 1<->0
4196
4197         if( h->deblocking_filter ) {
4198             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4199             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4200         }
4201     }
4202
4203     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4204        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != FF_I_TYPE)
4205        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == FF_B_TYPE)
4206        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4207         h->deblocking_filter= 0;
4208
4209     if(h->deblocking_filter == 1 && h0->max_contexts > 1) {
4210         if(s->avctx->flags2 & CODEC_FLAG2_FAST) {
4211             /* Cheat slightly for speed:
4212                Do not bother to deblock across slices. */
4213             h->deblocking_filter = 2;
4214         } else {
4215             h0->max_contexts = 1;
4216             if(!h0->single_decode_warning) {
4217                 av_log(s->avctx, AV_LOG_INFO, "Cannot parallelize deblocking type 1, decoding such frames in sequential order\n");
4218                 h0->single_decode_warning = 1;
4219             }
4220             if(h != h0)
4221                 return 1; // deblocking switched inside frame
4222         }
4223     }
4224
4225 #if 0 //FMO
4226     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4227         slice_group_change_cycle= get_bits(&s->gb, ?);
4228 #endif
4229
4230     h0->last_slice_type = slice_type;
4231     h->slice_num = ++h0->current_slice;
4232
4233     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4234     h->emu_edge_height= (FRAME_MBAFF || FIELD_PICTURE) ? 0 : h->emu_edge_width;
4235
4236     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4237         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%u frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4238                h->slice_num,
4239                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4240                first_mb_in_slice,
4241                av_get_pict_type_char(h->slice_type),
4242                pps_id, h->frame_num,
4243                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4244                h->ref_count[0], h->ref_count[1],
4245                s->qscale,
4246                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4247                h->use_weight,
4248                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4249                );
4250     }
4251
4252     return 0;
4253 }
4254
4255 /**
4256  *
4257  */
4258 static inline int get_level_prefix(GetBitContext *gb){
4259     unsigned int buf;
4260     int log;
4261
4262     OPEN_READER(re, gb);
4263     UPDATE_CACHE(re, gb);
4264     buf=GET_CACHE(re, gb);
4265
4266     log= 32 - av_log2(buf);
4267 #ifdef TRACE
4268     print_bin(buf>>(32-log), log);
4269     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4270 #endif
4271
4272     LAST_SKIP_BITS(re, gb, log);
4273     CLOSE_READER(re, gb);
4274
4275     return log-1;
4276 }
4277
4278 static inline int get_dct8x8_allowed(H264Context *h){
4279     int i;
4280     for(i=0; i<4; i++){
4281         if(!IS_SUB_8X8(h->sub_mb_type[i])
4282            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4283             return 0;
4284     }
4285     return 1;
4286 }
4287
4288 /**
4289  * decodes a residual block.
4290  * @param n block index
4291  * @param scantable scantable
4292  * @param max_coeff number of coefficients in the block
4293  * @return <0 if an error occured
4294  */
4295 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4296     MpegEncContext * const s = &h->s;
4297     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4298     int level[16];
4299     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4300
4301     //FIXME put trailing_onex into the context
4302
4303     if(n == CHROMA_DC_BLOCK_INDEX){
4304         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4305         total_coeff= coeff_token>>2;
4306     }else{
4307         if(n == LUMA_DC_BLOCK_INDEX){
4308             total_coeff= pred_non_zero_count(h, 0);
4309             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4310             total_coeff= coeff_token>>2;
4311         }else{
4312             total_coeff= pred_non_zero_count(h, n);
4313             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4314             total_coeff= coeff_token>>2;
4315             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4316         }
4317     }
4318
4319     //FIXME set last_non_zero?
4320
4321     if(total_coeff==0)
4322         return 0;
4323     if(total_coeff > (unsigned)max_coeff) {
4324         av_log(h->s.avctx, AV_LOG_ERROR, "corrupted macroblock %d %d (total_coeff=%d)\n", s->mb_x, s->mb_y, total_coeff);
4325         return -1;
4326     }
4327
4328     trailing_ones= coeff_token&3;
4329     tprintf(h->s.avctx, "trailing:%d, total:%d\n", trailing_ones, total_coeff);
4330     assert(total_coeff<=16);
4331
4332     for(i=0; i<trailing_ones; i++){
4333         level[i]= 1 - 2*get_bits1(gb);
4334     }
4335
4336     if(i<total_coeff) {
4337         int level_code, mask;
4338         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4339         int prefix= get_level_prefix(gb);
4340
4341         //first coefficient has suffix_length equal to 0 or 1
4342         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4343             if(suffix_length)
4344                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4345             else
4346                 level_code= (prefix<<suffix_length); //part
4347         }else if(prefix==14){
4348             if(suffix_length)
4349                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4350             else
4351                 level_code= prefix + get_bits(gb, 4); //part
4352         }else if(prefix==15){
4353             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4354             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4355         }else{
4356             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4357             return -1;
4358         }
4359
4360         if(trailing_ones < 3) level_code += 2;
4361
4362         suffix_length = 1;
4363         if(level_code > 5)
4364             suffix_length++;
4365         mask= -(level_code&1);
4366         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4367         i++;
4368
4369         //remaining coefficients have suffix_length > 0
4370         for(;i<total_coeff;i++) {
4371             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4372             prefix = get_level_prefix(gb);
4373             if(prefix<15){
4374                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4375             }else if(prefix==15){
4376                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4377             }else{
4378                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4379                 return -1;
4380             }
4381             mask= -(level_code&1);
4382             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4383             if(level_code > suffix_limit[suffix_length])
4384                 suffix_length++;
4385         }
4386     }
4387
4388     if(total_coeff == max_coeff)
4389         zeros_left=0;
4390     else{
4391         if(n == CHROMA_DC_BLOCK_INDEX)
4392             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4393         else
4394             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4395     }
4396
4397     coeff_num = zeros_left + total_coeff - 1;
4398     j = scantable[coeff_num];
4399     if(n > 24){
4400         block[j] = level[0];
4401         for(i=1;i<total_coeff;i++) {
4402             if(zeros_left <= 0)
4403                 run_before = 0;
4404             else if(zeros_left < 7){
4405                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4406             }else{
4407                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4408             }
4409             zeros_left -= run_before;
4410             coeff_num -= 1 + run_before;
4411             j= scantable[ coeff_num ];
4412
4413             block[j]= level[i];
4414         }
4415     }else{
4416         block[j] = (level[0] * qmul[j] + 32)>>6;
4417         for(i=1;i<total_coeff;i++) {
4418             if(zeros_left <= 0)
4419                 run_before = 0;
4420             else if(zeros_left < 7){
4421                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4422             }else{
4423                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4424             }
4425             zeros_left -= run_before;
4426             coeff_num -= 1 + run_before;
4427             j= scantable[ coeff_num ];
4428
4429             block[j]= (level[i] * qmul[j] + 32)>>6;
4430         }
4431     }
4432
4433     if(zeros_left<0){
4434         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4435         return -1;
4436     }
4437
4438     return 0;
4439 }
4440
4441 static void predict_field_decoding_flag(H264Context *h){
4442     MpegEncContext * const s = &h->s;
4443     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4444     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
4445                 ? s->current_picture.mb_type[mb_xy-1]
4446                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
4447                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
4448                 : 0;
4449     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
4450 }
4451
4452 /**
4453  * decodes a P_SKIP or B_SKIP macroblock
4454  */
4455 static void decode_mb_skip(H264Context *h){
4456     MpegEncContext * const s = &h->s;
4457     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4458     int mb_type=0;
4459
4460     memset(h->non_zero_count[mb_xy], 0, 16);
4461     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4462
4463     if(MB_FIELD)
4464         mb_type|= MB_TYPE_INTERLACED;
4465
4466     if( h->slice_type == FF_B_TYPE )
4467     {
4468         // just for fill_caches. pred_direct_motion will set the real mb_type
4469         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4470
4471         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4472         pred_direct_motion(h, &mb_type);
4473         mb_type|= MB_TYPE_SKIP;
4474     }
4475     else
4476     {
4477         int mx, my;
4478         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4479
4480         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4481         pred_pskip_motion(h, &mx, &my);
4482         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4483         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4484     }
4485
4486     write_back_motion(h, mb_type);
4487     s->current_picture.mb_type[mb_xy]= mb_type;
4488     s->current_picture.qscale_table[mb_xy]= s->qscale;
4489     h->slice_table[ mb_xy ]= h->slice_num;
4490     h->prev_mb_skipped= 1;
4491 }
4492
4493 /**
4494  * decodes a macroblock
4495  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4496  */
4497 static int decode_mb_cavlc(H264Context *h){
4498     MpegEncContext * const s = &h->s;
4499     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4500     int partition_count;
4501     unsigned int mb_type, cbp;
4502     int dct8x8_allowed= h->pps.transform_8x8_mode;
4503
4504     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4505
4506     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4507     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4508                 down the code */
4509     if(h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE){
4510         if(s->mb_skip_run==-1)
4511             s->mb_skip_run= get_ue_golomb(&s->gb);
4512
4513         if (s->mb_skip_run--) {
4514             if(FRAME_MBAFF && (s->mb_y&1) == 0){
4515                 if(s->mb_skip_run==0)
4516                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4517                 else
4518                     predict_field_decoding_flag(h);
4519             }
4520             decode_mb_skip(h);
4521             return 0;
4522         }
4523     }
4524     if(FRAME_MBAFF){
4525         if( (s->mb_y&1) == 0 )
4526             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
4527     }else
4528         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4529
4530     h->prev_mb_skipped= 0;
4531
4532     mb_type= get_ue_golomb(&s->gb);
4533     if(h->slice_type == FF_B_TYPE){
4534         if(mb_type < 23){
4535             partition_count= b_mb_type_info[mb_type].partition_count;
4536             mb_type=         b_mb_type_info[mb_type].type;
4537         }else{
4538             mb_type -= 23;
4539             goto decode_intra_mb;
4540         }
4541     }else if(h->slice_type == FF_P_TYPE /*|| h->slice_type == FF_SP_TYPE */){
4542         if(mb_type < 5){
4543             partition_count= p_mb_type_info[mb_type].partition_count;
4544             mb_type=         p_mb_type_info[mb_type].type;
4545         }else{
4546             mb_type -= 5;
4547             goto decode_intra_mb;
4548         }
4549     }else{
4550        assert(h->slice_type == FF_I_TYPE);
4551 decode_intra_mb:
4552         if(mb_type > 25){
4553             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice too large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4554             return -1;
4555         }
4556         partition_count=0;
4557         cbp= i_mb_type_info[mb_type].cbp;
4558         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4559         mb_type= i_mb_type_info[mb_type].type;
4560     }
4561
4562     if(MB_FIELD)
4563         mb_type |= MB_TYPE_INTERLACED;
4564
4565     h->slice_table[ mb_xy ]= h->slice_num;
4566
4567     if(IS_INTRA_PCM(mb_type)){
4568         unsigned int x, y;
4569
4570         // We assume these blocks are very rare so we do not optimize it.
4571         align_get_bits(&s->gb);
4572
4573         // The pixels are stored in the same order as levels in h->mb array.
4574         for(y=0; y<16; y++){
4575             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4576             for(x=0; x<16; x++){
4577                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4578                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4579             }
4580         }
4581         for(y=0; y<8; y++){
4582             const int index= 256 + 4*(y&3) + 32*(y>>2);
4583             for(x=0; x<8; x++){
4584                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4585                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4586             }
4587         }
4588         for(y=0; y<8; y++){
4589             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4590             for(x=0; x<8; x++){
4591                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4592                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4593             }
4594         }
4595
4596         // In deblocking, the quantizer is 0
4597         s->current_picture.qscale_table[mb_xy]= 0;
4598         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
4599         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
4600         // All coeffs are present
4601         memset(h->non_zero_count[mb_xy], 16, 16);
4602
4603         s->current_picture.mb_type[mb_xy]= mb_type;
4604         return 0;
4605     }
4606
4607     if(MB_MBAFF){
4608         h->ref_count[0] <<= 1;
4609         h->ref_count[1] <<= 1;
4610     }
4611
4612     fill_caches(h, mb_type, 0);
4613
4614     //mb_pred
4615     if(IS_INTRA(mb_type)){
4616             int pred_mode;
4617 //            init_top_left_availability(h);
4618             if(IS_INTRA4x4(mb_type)){
4619                 int i;
4620                 int di = 1;
4621                 if(dct8x8_allowed && get_bits1(&s->gb)){
4622                     mb_type |= MB_TYPE_8x8DCT;
4623                     di = 4;
4624                 }
4625
4626 //                fill_intra4x4_pred_table(h);
4627                 for(i=0; i<16; i+=di){
4628                     int mode= pred_intra_mode(h, i);
4629
4630                     if(!get_bits1(&s->gb)){
4631                         const int rem_mode= get_bits(&s->gb, 3);
4632                         mode = rem_mode + (rem_mode >= mode);
4633                     }
4634
4635                     if(di==4)
4636                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4637                     else
4638                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4639                 }
4640                 write_back_intra_pred_mode(h);
4641                 if( check_intra4x4_pred_mode(h) < 0)
4642                     return -1;
4643             }else{
4644                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4645                 if(h->intra16x16_pred_mode < 0)
4646                     return -1;
4647             }
4648
4649             pred_mode= check_intra_pred_mode(h, get_ue_golomb(&s->gb));
4650             if(pred_mode < 0)
4651                 return -1;
4652             h->chroma_pred_mode= pred_mode;
4653     }else if(partition_count==4){
4654         int i, j, sub_partition_count[4], list, ref[2][4];
4655
4656         if(h->slice_type == FF_B_TYPE){
4657             for(i=0; i<4; i++){
4658                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4659                 if(h->sub_mb_type[i] >=13){
4660                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4661                     return -1;
4662                 }
4663                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4664                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4665             }
4666             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4667                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4668                 pred_direct_motion(h, &mb_type);
4669                 h->ref_cache[0][scan8[4]] =
4670                 h->ref_cache[1][scan8[4]] =
4671                 h->ref_cache[0][scan8[12]] =
4672                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4673             }
4674         }else{
4675             assert(h->slice_type == FF_P_TYPE || h->slice_type == FF_SP_TYPE); //FIXME SP correct ?
4676             for(i=0; i<4; i++){
4677                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4678                 if(h->sub_mb_type[i] >=4){
4679                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %u out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4680                     return -1;
4681                 }
4682                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4683                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4684             }
4685         }
4686
4687         for(list=0; list<h->list_count; list++){
4688             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4689             for(i=0; i<4; i++){
4690                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4691                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4692                     unsigned int tmp = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4693                     if(tmp>=ref_count){
4694                         av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", tmp);
4695                         return -1;
4696                     }
4697                     ref[list][i]= tmp;
4698                 }else{
4699                  //FIXME
4700                     ref[list][i] = -1;
4701                 }
4702             }
4703         }
4704
4705         if(dct8x8_allowed)
4706             dct8x8_allowed = get_dct8x8_allowed(h);
4707
4708         for(list=0; list<h->list_count; list++){
4709             for(i=0; i<4; i++){
4710                 if(IS_DIRECT(h->sub_mb_type[i])) {
4711                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4712                     continue;
4713                 }
4714                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4715                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4716
4717                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4718                     const int sub_mb_type= h->sub_mb_type[i];
4719                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4720                     for(j=0; j<sub_partition_count[i]; j++){
4721                         int mx, my;
4722                         const int index= 4*i + block_width*j;
4723                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4724                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4725                         mx += get_se_golomb(&s->gb);
4726                         my += get_se_golomb(&s->gb);
4727                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4728
4729                         if(IS_SUB_8X8(sub_mb_type)){
4730                             mv_cache[ 1 ][0]=
4731                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4732                             mv_cache[ 1 ][1]=
4733                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4734                         }else if(IS_SUB_8X4(sub_mb_type)){
4735                             mv_cache[ 1 ][0]= mx;
4736                             mv_cache[ 1 ][1]= my;
4737                         }else if(IS_SUB_4X8(sub_mb_type)){
4738                             mv_cache[ 8 ][0]= mx;
4739                             mv_cache[ 8 ][1]= my;
4740                         }
4741                         mv_cache[ 0 ][0]= mx;
4742                         mv_cache[ 0 ][1]= my;
4743                     }
4744                 }else{
4745                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4746                     p[0] = p[1]=
4747                     p[8] = p[9]= 0;
4748                 }
4749             }
4750         }
4751     }else if(IS_DIRECT(mb_type)){
4752         pred_direct_motion(h, &mb_type);
4753         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
4754     }else{
4755         int list, mx, my, i;
4756          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4757         if(IS_16X16(mb_type)){
4758             for(list=0; list<h->list_count; list++){
4759                     unsigned int val;
4760                     if(IS_DIR(mb_type, 0, list)){
4761                         val= get_te0_golomb(&s->gb, h->ref_count[list]);
4762                         if(val >= h->ref_count[list]){
4763                             av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4764                             return -1;
4765                         }
4766                     }else
4767                         val= LIST_NOT_USED&0xFF;
4768                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4769             }
4770             for(list=0; list<h->list_count; list++){
4771                 unsigned int val;
4772                 if(IS_DIR(mb_type, 0, list)){
4773                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4774                     mx += get_se_golomb(&s->gb);
4775                     my += get_se_golomb(&s->gb);
4776                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4777
4778                     val= pack16to32(mx,my);
4779                 }else
4780                     val=0;
4781                 fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, val, 4);
4782             }
4783         }
4784         else if(IS_16X8(mb_type)){
4785             for(list=0; list<h->list_count; list++){
4786                     for(i=0; i<2; i++){
4787                         unsigned int val;
4788                         if(IS_DIR(mb_type, i, list)){
4789                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4790                             if(val >= h->ref_count[list]){
4791                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4792                                 return -1;
4793                             }
4794                         }else
4795                             val= LIST_NOT_USED&0xFF;
4796                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4797                     }
4798             }
4799             for(list=0; list<h->list_count; list++){
4800                 for(i=0; i<2; i++){
4801                     unsigned int val;
4802                     if(IS_DIR(mb_type, i, list)){
4803                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4804                         mx += get_se_golomb(&s->gb);
4805                         my += get_se_golomb(&s->gb);
4806                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4807
4808                         val= pack16to32(mx,my);
4809                     }else
4810                         val=0;
4811                     fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 4);
4812                 }
4813             }
4814         }else{
4815             assert(IS_8X16(mb_type));
4816             for(list=0; list<h->list_count; list++){
4817                     for(i=0; i<2; i++){
4818                         unsigned int val;
4819                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4820                             val= get_te0_golomb(&s->gb, h->ref_count[list]);
4821                             if(val >= h->ref_count[list]){
4822                                 av_log(h->s.avctx, AV_LOG_ERROR, "ref %u overflow\n", val);
4823                                 return -1;
4824                             }
4825                         }else
4826                             val= LIST_NOT_USED&0xFF;
4827                         fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4828                     }
4829             }
4830             for(list=0; list<h->list_count; list++){
4831                 for(i=0; i<2; i++){
4832                     unsigned int val;
4833                     if(IS_DIR(mb_type, i, list)){
4834                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4835                         mx += get_se_golomb(&s->gb);
4836                         my += get_se_golomb(&s->gb);
4837                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
4838
4839                         val= pack16to32(mx,my);
4840                     }else
4841                         val=0;
4842                     fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 4);
4843                 }
4844             }
4845         }
4846     }
4847
4848     if(IS_INTER(mb_type))
4849         write_back_motion(h, mb_type);
4850
4851     if(!IS_INTRA16x16(mb_type)){
4852         cbp= get_ue_golomb(&s->gb);
4853         if(cbp > 47){
4854             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%u) at %d %d\n", cbp, s->mb_x, s->mb_y);
4855             return -1;
4856         }
4857
4858         if(IS_INTRA4x4(mb_type))
4859             cbp= golomb_to_intra4x4_cbp[cbp];
4860         else
4861             cbp= golomb_to_inter_cbp[cbp];
4862     }
4863     h->cbp = cbp;
4864
4865     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
4866         if(get_bits1(&s->gb))
4867             mb_type |= MB_TYPE_8x8DCT;
4868     }
4869     s->current_picture.mb_type[mb_xy]= mb_type;
4870
4871     if(cbp || IS_INTRA16x16(mb_type)){
4872         int i8x8, i4x4, chroma_idx;
4873         int dquant;
4874         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4875         const uint8_t *scan, *scan8x8, *dc_scan;
4876
4877 //        fill_non_zero_count_cache(h);
4878
4879         if(IS_INTERLACED(mb_type)){
4880             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
4881             scan= s->qscale ? h->field_scan : h->field_scan_q0;
4882             dc_scan= luma_dc_field_scan;
4883         }else{
4884             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
4885             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
4886             dc_scan= luma_dc_zigzag_scan;
4887         }
4888
4889         dquant= get_se_golomb(&s->gb);
4890
4891         if( dquant > 25 || dquant < -26 ){
4892             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4893             return -1;
4894         }
4895
4896         s->qscale += dquant;
4897         if(((unsigned)s->qscale) > 51){
4898             if(s->qscale<0) s->qscale+= 52;
4899             else            s->qscale-= 52;
4900         }
4901
4902         h->chroma_qp[0]= get_chroma_qp(h, 0, s->qscale);
4903         h->chroma_qp[1]= get_chroma_qp(h, 1, s->qscale);
4904         if(IS_INTRA16x16(mb_type)){
4905             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
4906                 return -1; //FIXME continue if partitioned and other return -1 too
4907             }
4908
4909             assert((cbp&15) == 0 || (cbp&15) == 15);
4910
4911             if(cbp&15){
4912                 for(i8x8=0; i8x8<4; i8x8++){
4913                     for(i4x4=0; i4x4<4; i4x4++){
4914                         const int index= i4x4 + 4*i8x8;
4915                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
4916                             return -1;
4917                         }
4918                     }
4919                 }
4920             }else{
4921                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4922             }
4923         }else{
4924             for(i8x8=0; i8x8<4; i8x8++){
4925                 if(cbp & (1<<i8x8)){
4926                     if(IS_8x8DCT(mb_type)){
4927                         DCTELEM *buf = &h->mb[64*i8x8];
4928                         uint8_t *nnz;
4929                         for(i4x4=0; i4x4<4; i4x4++){
4930                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
4931                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
4932                                 return -1;
4933                         }
4934                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4935                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
4936                     }else{
4937                         for(i4x4=0; i4x4<4; i4x4++){
4938                             const int index= i4x4 + 4*i8x8;
4939
4940                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
4941                                 return -1;
4942                             }
4943                         }
4944                     }
4945                 }else{
4946                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4947                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4948                 }
4949             }
4950         }
4951
4952         if(cbp&0x30){
4953             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4954                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
4955                     return -1;
4956                 }
4957         }
4958
4959         if(cbp&0x20){
4960             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4961                 const uint32_t *qmul = h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[chroma_idx]];
4962                 for(i4x4=0; i4x4<4; i4x4++){
4963                     const int index= 16 + 4*chroma_idx + i4x4;
4964                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, qmul, 15) < 0){
4965                         return -1;
4966                     }
4967                 }
4968             }
4969         }else{
4970             uint8_t * const nnz= &h->non_zero_count_cache[0];
4971             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4972             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4973         }
4974     }else{
4975         uint8_t * const nnz= &h->non_zero_count_cache[0];
4976         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4977         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4978         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4979     }
4980     s->current_picture.qscale_table[mb_xy]= s->qscale;
4981     write_back_non_zero_count(h);
4982
4983     if(MB_MBAFF){
4984         h->ref_count[0] >>= 1;
4985         h->ref_count[1] >>= 1;
4986     }
4987
4988     return 0;
4989 }
4990
4991 static int decode_cabac_field_decoding_flag(H264Context *h) {
4992     MpegEncContext * const s = &h->s;
4993     const int mb_x = s->mb_x;
4994     const int mb_y = s->mb_y & ~1;
4995     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4996     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4997
4998     unsigned int ctx = 0;
4999
5000     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5001         ctx += 1;
5002     }
5003     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5004         ctx += 1;
5005     }
5006
5007     return get_cabac_noinline( &h->cabac, &h->cabac_state[70 + ctx] );
5008 }
5009
5010 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5011     uint8_t *state= &h->cabac_state[ctx_base];
5012     int mb_type;
5013
5014     if(intra_slice){
5015         MpegEncContext * const s = &h->s;
5016         const int mba_xy = h->left_mb_xy[0];
5017         const int mbb_xy = h->top_mb_xy;
5018         int ctx=0;
5019         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5020             ctx++;
5021         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5022             ctx++;
5023         if( get_cabac_noinline( &h->cabac, &state[ctx] ) == 0 )
5024             return 0;   /* I4x4 */
5025         state += 2;
5026     }else{
5027         if( get_cabac_noinline( &h->cabac, &state[0] ) == 0 )
5028             return 0;   /* I4x4 */
5029     }
5030
5031     if( get_cabac_terminate( &h->cabac ) )
5032         return 25;  /* PCM */
5033
5034     mb_type = 1; /* I16x16 */
5035     mb_type += 12 * get_cabac_noinline( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5036     if( get_cabac_noinline( &h->cabac, &state[2] ) ) /* cbp_chroma */
5037         mb_type += 4 + 4 * get_cabac_noinline( &h->cabac, &state[2+intra_slice] );
5038     mb_type += 2 * get_cabac_noinline( &h->cabac, &state[3+intra_slice] );
5039     mb_type += 1 * get_cabac_noinline( &h->cabac, &state[3+2*intra_slice] );
5040     return mb_type;
5041 }
5042
5043 static int decode_cabac_mb_type( H264Context *h ) {
5044     MpegEncContext * const s = &h->s;
5045
5046     if( h->slice_type == FF_I_TYPE ) {
5047         return decode_cabac_intra_mb_type(h, 3, 1);
5048     } else if( h->slice_type == FF_P_TYPE ) {
5049         if( get_cabac_noinline( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5050             /* P-type */
5051             if( get_cabac_noinline( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5052                 /* P_L0_D16x16, P_8x8 */
5053                 return 3 * get_cabac_noinline( &h->cabac, &h->cabac_state[16] );
5054             } else {
5055                 /* P_L0_D8x16, P_L0_D16x8 */
5056                 return 2 - get_cabac_noinline( &h->cabac, &h->cabac_state[17] );
5057             }
5058         } else {
5059             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5060         }
5061     } else if( h->slice_type == FF_B_TYPE ) {
5062         const int mba_xy = h->left_mb_xy[0];
5063         const int mbb_xy = h->top_mb_xy;
5064         int ctx = 0;
5065         int bits;
5066
5067         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5068             ctx++;
5069         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5070             ctx++;
5071
5072         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+ctx] ) )
5073             return 0; /* B_Direct_16x16 */
5074
5075         if( !get_cabac_noinline( &h->cabac, &h->cabac_state[27+3] ) ) {
5076             return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5077         }
5078
5079         bits = get_cabac_noinline( &h->cabac, &h->cabac_state[27+4] ) << 3;
5080         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 2;
5081         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] ) << 1;
5082         bits|= get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5083         if( bits < 8 )
5084             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5085         else if( bits == 13 ) {
5086             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5087         } else if( bits == 14 )
5088             return 11; /* B_L1_L0_8x16 */
5089         else if( bits == 15 )
5090             return 22; /* B_8x8 */
5091
5092         bits= ( bits<<1 ) | get_cabac_noinline( &h->cabac, &h->cabac_state[27+5] );
5093         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5094     } else {
5095         /* TODO SI/SP frames? */
5096         return -1;
5097     }
5098 }
5099
5100 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5101     MpegEncContext * const s = &h->s;
5102     int mba_xy, mbb_xy;
5103     int ctx = 0;
5104
5105     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5106         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5107         mba_xy = mb_xy - 1;
5108         if( (mb_y&1)
5109             && h->slice_table[mba_xy] == h->slice_num
5110             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5111             mba_xy += s->mb_stride;
5112         if( MB_FIELD ){
5113             mbb_xy = mb_xy - s->mb_stride;
5114             if( !(mb_y&1)
5115                 && h->slice_table[mbb_xy] == h->slice_num
5116                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5117                 mbb_xy -= s->mb_stride;
5118         }else
5119             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5120     }else{
5121         int mb_xy = mb_x + mb_y*s->mb_stride;
5122         mba_xy = mb_xy - 1;
5123         mbb_xy = mb_xy - (s->mb_stride << FIELD_PICTURE);
5124     }
5125
5126     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5127         ctx++;
5128     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5129         ctx++;
5130
5131     if( h->slice_type == FF_B_TYPE )
5132         ctx += 13;
5133     return get_cabac_noinline( &h->cabac, &h->cabac_state[11+ctx] );
5134 }
5135
5136 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5137     int mode = 0;
5138
5139     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5140         return pred_mode;
5141
5142     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5143     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5144     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5145
5146     if( mode >= pred_mode )
5147         return mode + 1;
5148     else
5149         return mode;
5150 }
5151
5152 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5153     const int mba_xy = h->left_mb_xy[0];
5154     const int mbb_xy = h->top_mb_xy;
5155
5156     int ctx = 0;
5157
5158     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5159     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5160         ctx++;
5161
5162     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5163         ctx++;
5164
5165     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5166         return 0;
5167
5168     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5169         return 1;
5170     if( get_cabac_noinline( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5171         return 2;
5172     else
5173         return 3;
5174 }
5175
5176 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5177     int cbp_b, cbp_a, ctx, cbp = 0;
5178
5179     cbp_a = h->slice_table[h->left_mb_xy[0]] == h->slice_num ? h->left_cbp : -1;
5180     cbp_b = h->slice_table[h->top_mb_xy]     == h->slice_num ? h->top_cbp  : -1;
5181
5182     ctx = !(cbp_a & 0x02) + 2 * !(cbp_b & 0x04);
5183     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]);
5184     ctx = !(cbp   & 0x01) + 2 * !(cbp_b & 0x08);
5185     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 1;
5186     ctx = !(cbp_a & 0x08) + 2 * !(cbp   & 0x01);
5187     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 2;
5188     ctx = !(cbp   & 0x04) + 2 * !(cbp   & 0x02);
5189     cbp |= get_cabac_noinline(&h->cabac, &h->cabac_state[73 + ctx]) << 3;
5190     return cbp;
5191 }
5192 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5193     int ctx;
5194     int cbp_a, cbp_b;
5195
5196     cbp_a = (h->left_cbp>>4)&0x03;
5197     cbp_b = (h-> top_cbp>>4)&0x03;
5198
5199     ctx = 0;
5200     if( cbp_a > 0 ) ctx++;
5201     if( cbp_b > 0 ) ctx += 2;
5202     if( get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5203         return 0;
5204
5205     ctx = 4;
5206     if( cbp_a == 2 ) ctx++;
5207     if( cbp_b == 2 ) ctx += 2;
5208     return 1 + get_cabac_noinline( &h->cabac, &h->cabac_state[77 + ctx] );
5209 }
5210 static int decode_cabac_mb_dqp( H264Context *h) {
5211     int   ctx = 0;
5212     int   val = 0;
5213
5214     if( h->last_qscale_diff != 0 )
5215         ctx++;
5216
5217     while( get_cabac_noinline( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5218         if( ctx < 2 )
5219             ctx = 2;
5220         else
5221             ctx = 3;
5222         val++;
5223         if(val > 102) //prevent infinite loop
5224             return INT_MIN;
5225     }
5226
5227     if( val&0x01 )
5228         return (val + 1)/2;
5229     else
5230         return -(val + 1)/2;
5231 }
5232 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5233     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5234         return 0;   /* 8x8 */
5235     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5236         return 1;   /* 8x4 */
5237     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5238         return 2;   /* 4x8 */
5239     return 3;       /* 4x4 */
5240 }
5241 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5242     int type;
5243     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5244         return 0;   /* B_Direct_8x8 */
5245     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5246         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5247     type = 3;
5248     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5249         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5250             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5251         type += 4;
5252     }
5253     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5254     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5255     return type;
5256 }
5257
5258 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5259     return get_cabac_noinline( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5260 }
5261
5262 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5263     int refa = h->ref_cache[list][scan8[n] - 1];
5264     int refb = h->ref_cache[list][scan8[n] - 8];
5265     int ref  = 0;
5266     int ctx  = 0;
5267
5268     if( h->slice_type == FF_B_TYPE) {
5269         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5270             ctx++;
5271         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5272             ctx += 2;
5273     } else {
5274         if( refa > 0 )
5275             ctx++;
5276         if( refb > 0 )
5277             ctx += 2;
5278     }
5279
5280     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5281         ref++;
5282         if( ctx < 4 )
5283             ctx = 4;
5284         else
5285             ctx = 5;
5286         if(ref >= 32 /*h->ref_list[list]*/){
5287             av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_ref\n");
5288             return 0; //FIXME we should return -1 and check the return everywhere
5289         }
5290     }
5291     return ref;
5292 }
5293
5294 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5295     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5296                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5297     int ctxbase = (l == 0) ? 40 : 47;
5298     int ctx, mvd;
5299
5300     if( amvd < 3 )
5301         ctx = 0;
5302     else if( amvd > 32 )
5303         ctx = 2;
5304     else
5305         ctx = 1;
5306
5307     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5308         return 0;
5309
5310     mvd= 1;
5311     ctx= 3;
5312     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5313         mvd++;
5314         if( ctx < 6 )
5315             ctx++;
5316     }
5317
5318     if( mvd >= 9 ) {
5319         int k = 3;
5320         while( get_cabac_bypass( &h->cabac ) ) {
5321             mvd += 1 << k;
5322             k++;
5323             if(k>24){
5324                 av_log(h->s.avctx, AV_LOG_ERROR, "overflow in decode_cabac_mb_mvd\n");
5325                 return INT_MIN;
5326             }
5327         }
5328         while( k-- ) {
5329             if( get_cabac_bypass( &h->cabac ) )
5330                 mvd += 1 << k;
5331         }
5332     }
5333     return get_cabac_bypass_sign( &h->cabac, -mvd );
5334 }
5335
5336 static inline int get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5337     int nza, nzb;
5338     int ctx = 0;
5339
5340     if( cat == 0 ) {
5341         nza = h->left_cbp&0x100;
5342         nzb = h-> top_cbp&0x100;
5343     } else if( cat == 1 || cat == 2 ) {
5344         nza = h->non_zero_count_cache[scan8[idx] - 1];
5345         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5346     } else if( cat == 3 ) {
5347         nza = (h->left_cbp>>(6+idx))&0x01;
5348         nzb = (h-> top_cbp>>(6+idx))&0x01;
5349     } else {
5350         assert(cat == 4);
5351         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5352         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5353     }
5354
5355     if( nza > 0 )
5356         ctx++;
5357
5358     if( nzb > 0 )
5359         ctx += 2;
5360
5361     return ctx + 4 * cat;
5362 }
5363
5364 DECLARE_ASM_CONST(1, const uint8_t, last_coeff_flag_offset_8x8[63]) = {
5365     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5366     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5367     3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5368     5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5369 };
5370
5371 static void decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5372     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5373     static const int significant_coeff_flag_offset[2][6] = {
5374       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
5375       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
5376     };
5377     static const int last_coeff_flag_offset[2][6] = {
5378       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
5379       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
5380     };
5381     static const int coeff_abs_level_m1_offset[6] = {
5382         227+0, 227+10, 227+20, 227+30, 227+39, 426
5383     };
5384     static const uint8_t significant_coeff_flag_offset_8x8[2][63] = {
5385       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5386         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5387         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5388        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
5389       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
5390         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
5391         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
5392         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
5393     };
5394
5395     int index[64];
5396
5397     int av_unused last;
5398     int coeff_count = 0;
5399
5400     int abslevel1 = 1;
5401     int abslevelgt1 = 0;
5402
5403     uint8_t *significant_coeff_ctx_base;
5404     uint8_t *last_coeff_ctx_base;
5405     uint8_t *abs_level_m1_ctx_base;
5406
5407 #ifndef ARCH_X86
5408 #define CABAC_ON_STACK
5409 #endif
5410 #ifdef CABAC_ON_STACK
5411 #define CC &cc
5412     CABACContext cc;
5413     cc.range     = h->cabac.range;
5414     cc.low       = h->cabac.low;
5415     cc.bytestream= h->cabac.bytestream;
5416 #else
5417 #define CC &h->cabac
5418 #endif
5419
5420
5421     /* cat: 0-> DC 16x16  n = 0
5422      *      1-> AC 16x16  n = luma4x4idx
5423      *      2-> Luma4x4   n = luma4x4idx
5424      *      3-> DC Chroma n = iCbCr
5425      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5426      *      5-> Luma8x8   n = 4 * luma8x8idx
5427      */
5428
5429     /* read coded block flag */
5430     if( cat != 5 ) {
5431         if( get_cabac( CC, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5432             if( cat == 1 || cat == 2 )
5433                 h->non_zero_count_cache[scan8[n]] = 0;
5434             else if( cat == 4 )
5435                 h->non_zero_count_cache[scan8[16+n]] = 0;
5436 #ifdef CABAC_ON_STACK
5437             h->cabac.range     = cc.range     ;
5438             h->cabac.low       = cc.low       ;
5439             h->cabac.bytestream= cc.bytestream;
5440 #endif
5441             return;
5442         }
5443     }
5444
5445     significant_coeff_ctx_base = h->cabac_state
5446         + significant_coeff_flag_offset[MB_FIELD][cat];
5447     last_coeff_ctx_base = h->cabac_state
5448         + last_coeff_flag_offset[MB_FIELD][cat];
5449     abs_level_m1_ctx_base = h->cabac_state
5450         + coeff_abs_level_m1_offset[cat];
5451
5452     if( cat == 5 ) {
5453 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5454         for(last= 0; last < coefs; last++) { \
5455             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5456             if( get_cabac( CC, sig_ctx )) { \
5457                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5458                 index[coeff_count++] = last; \
5459                 if( get_cabac( CC, last_ctx ) ) { \
5460                     last= max_coeff; \
5461                     break; \
5462                 } \
5463             } \
5464         }\
5465         if( last == max_coeff -1 ) {\
5466             index[coeff_count++] = last;\
5467         }
5468         const uint8_t *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
5469 #if defined(ARCH_X86) && defined(HAVE_7REGS) && defined(HAVE_EBX_AVAILABLE) && !defined(BROKEN_RELOCATIONS)
5470         coeff_count= decode_significance_8x8_x86(CC, significant_coeff_ctx_base, index, sig_off);
5471     } else {
5472         coeff_count= decode_significance_x86(CC, max_coeff, significant_coeff_ctx_base, index);
5473 #else
5474         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
5475     } else {
5476         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5477 #endif
5478     }
5479     assert(coeff_count > 0);
5480
5481     if( cat == 0 )
5482         h->cbp_table[mb_xy] |= 0x100;
5483     else if( cat == 1 || cat == 2 )
5484         h->non_zero_count_cache[scan8[n]] = coeff_count;
5485     else if( cat == 3 )
5486         h->cbp_table[mb_xy] |= 0x40 << n;
5487     else if( cat == 4 )
5488         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5489     else {
5490         assert( cat == 5 );
5491         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5492     }
5493
5494     for( coeff_count--; coeff_count >= 0; coeff_count-- ) {
5495         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5496         int j= scantable[index[coeff_count]];
5497
5498         if( get_cabac( CC, ctx ) == 0 ) {
5499             if( !qmul ) {
5500                 block[j] = get_cabac_bypass_sign( CC, -1);
5501             }else{
5502                 block[j] = (get_cabac_bypass_sign( CC, -qmul[j]) + 32) >> 6;
5503             }
5504
5505             abslevel1++;
5506         } else {
5507             int coeff_abs = 2;
5508             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5509             while( coeff_abs < 15 && get_cabac( CC, ctx ) ) {
5510                 coeff_abs++;
5511             }
5512
5513             if( coeff_abs >= 15 ) {
5514                 int j = 0;
5515                 while( get_cabac_bypass( CC ) ) {
5516                     j++;
5517                 }
5518
5519                 coeff_abs=1;
5520                 while( j-- ) {
5521                     coeff_abs += coeff_abs + get_cabac_bypass( CC );
5522                 }
5523                 coeff_abs+= 14;
5524             }
5525
5526             if( !qmul ) {
5527                 if( get_cabac_bypass( CC ) ) block[j] = -coeff_abs;
5528                 else                                block[j] =  coeff_abs;
5529             }else{
5530                 if( get_cabac_bypass( CC ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5531                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5532             }
5533
5534             abslevelgt1++;
5535         }
5536     }
5537 #ifdef CABAC_ON_STACK
5538             h->cabac.range     = cc.range     ;
5539             h->cabac.low       = cc.low       ;
5540             h->cabac.bytestream= cc.bytestream;
5541 #endif
5542
5543 }
5544
5545 static inline void compute_mb_neighbors(H264Context *h)
5546 {
5547     MpegEncContext * const s = &h->s;
5548     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5549     h->top_mb_xy     = mb_xy - s->mb_stride;
5550     h->left_mb_xy[0] = mb_xy - 1;
5551     if(FRAME_MBAFF){
5552         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5553         const int top_pair_xy      = pair_xy     - s->mb_stride;
5554         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5555         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5556         const int curr_mb_frame_flag = !MB_FIELD;
5557         const int bottom = (s->mb_y & 1);
5558         if (bottom
5559                 ? !curr_mb_frame_flag // bottom macroblock
5560                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5561                 ) {
5562             h->top_mb_xy -= s->mb_stride;
5563         }
5564         if (left_mb_frame_flag != curr_mb_frame_flag) {
5565             h->left_mb_xy[0] = pair_xy - 1;
5566         }
5567     } else if (FIELD_PICTURE) {
5568         h->top_mb_xy -= s->mb_stride;
5569     }
5570     return;
5571 }
5572
5573 /**
5574  * decodes a macroblock
5575  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5576  */
5577 static int decode_mb_cabac(H264Context *h) {
5578     MpegEncContext * const s = &h->s;
5579     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5580     int mb_type, partition_count, cbp = 0;
5581     int dct8x8_allowed= h->pps.transform_8x8_mode;
5582
5583     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5584
5585     tprintf(s->avctx, "pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5586     if( h->slice_type != FF_I_TYPE && h->slice_type != FF_SI_TYPE ) {
5587         int skip;
5588         /* a skipped mb needs the aff flag from the following mb */
5589         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
5590             predict_field_decoding_flag(h);
5591         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
5592             skip = h->next_mb_skipped;
5593         else
5594             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
5595         /* read skip flags */
5596         if( skip ) {
5597             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
5598                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
5599                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
5600                 if(h->next_mb_skipped)
5601                     predict_field_decoding_flag(h);
5602                 else
5603                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5604             }
5605
5606             decode_mb_skip(h);
5607
5608             h->cbp_table[mb_xy] = 0;
5609             h->chroma_pred_mode_table[mb_xy] = 0;
5610             h->last_qscale_diff = 0;
5611
5612             return 0;
5613
5614         }
5615     }
5616     if(FRAME_MBAFF){
5617         if( (s->mb_y&1) == 0 )
5618             h->mb_mbaff =
5619             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5620     }else
5621         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5622
5623     h->prev_mb_skipped = 0;
5624
5625     compute_mb_neighbors(h);
5626     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5627         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5628         return -1;
5629     }
5630
5631     if( h->slice_type == FF_B_TYPE ) {
5632         if( mb_type < 23 ){
5633             partition_count= b_mb_type_info[mb_type].partition_count;
5634             mb_type=         b_mb_type_info[mb_type].type;
5635         }else{
5636             mb_type -= 23;
5637             goto decode_intra_mb;
5638         }
5639     } else if( h->slice_type == FF_P_TYPE ) {
5640         if( mb_type < 5) {
5641             partition_count= p_mb_type_info[mb_type].partition_count;
5642             mb_type=         p_mb_type_info[mb_type].type;
5643         } else {
5644             mb_type -= 5;
5645             goto decode_intra_mb;
5646         }
5647     } else {
5648        assert(h->slice_type == FF_I_TYPE);
5649 decode_intra_mb:
5650         partition_count = 0;
5651         cbp= i_mb_type_info[mb_type].cbp;
5652         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5653         mb_type= i_mb_type_info[mb_type].type;
5654     }
5655     if(MB_FIELD)
5656         mb_type |= MB_TYPE_INTERLACED;
5657
5658     h->slice_table[ mb_xy ]= h->slice_num;
5659
5660     if(IS_INTRA_PCM(mb_type)) {
5661         const uint8_t *ptr;
5662         unsigned int x, y;
5663
5664         // We assume these blocks are very rare so we do not optimize it.
5665         // FIXME The two following lines get the bitstream position in the cabac
5666         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5667         ptr= h->cabac.bytestream;
5668         if(h->cabac.low&0x1) ptr--;
5669         if(CABAC_BITS==16){
5670             if(h->cabac.low&0x1FF) ptr--;
5671         }
5672
5673         // The pixels are stored in the same order as levels in h->mb array.
5674         for(y=0; y<16; y++){
5675             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5676             for(x=0; x<16; x++){
5677                 tprintf(s->avctx, "LUMA ICPM LEVEL (%3d)\n", *ptr);
5678                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5679             }
5680         }
5681         for(y=0; y<8; y++){
5682             const int index= 256 + 4*(y&3) + 32*(y>>2);
5683             for(x=0; x<8; x++){
5684                 tprintf(s->avctx, "CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5685                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5686             }
5687         }
5688         for(y=0; y<8; y++){
5689             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5690             for(x=0; x<8; x++){
5691                 tprintf(s->avctx, "CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5692                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5693             }
5694         }
5695
5696         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5697
5698         // All blocks are present
5699         h->cbp_table[mb_xy] = 0x1ef;
5700         h->chroma_pred_mode_table[mb_xy] = 0;
5701         // In deblocking, the quantizer is 0
5702         s->current_picture.qscale_table[mb_xy]= 0;
5703         h->chroma_qp[0] = get_chroma_qp(h, 0, 0);
5704         h->chroma_qp[1] = get_chroma_qp(h, 1, 0);
5705         // All coeffs are present
5706         memset(h->non_zero_count[mb_xy], 16, 16);
5707         s->current_picture.mb_type[mb_xy]= mb_type;
5708         return 0;
5709     }
5710
5711     if(MB_MBAFF){
5712         h->ref_count[0] <<= 1;
5713         h->ref_count[1] <<= 1;
5714     }
5715
5716     fill_caches(h, mb_type, 0);
5717
5718     if( IS_INTRA( mb_type ) ) {
5719         int i, pred_mode;
5720         if( IS_INTRA4x4( mb_type ) ) {
5721             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5722                 mb_type |= MB_TYPE_8x8DCT;
5723                 for( i = 0; i < 16; i+=4 ) {
5724                     int pred = pred_intra_mode( h, i );
5725                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5726                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5727                 }
5728             } else {
5729                 for( i = 0; i < 16; i++ ) {
5730                     int pred = pred_intra_mode( h, i );
5731                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5732
5733                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5734                 }
5735             }
5736             write_back_intra_pred_mode(h);
5737             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5738         } else {
5739             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5740             if( h->intra16x16_pred_mode < 0 ) return -1;
5741         }
5742         h->chroma_pred_mode_table[mb_xy] =
5743         pred_mode                        = decode_cabac_mb_chroma_pre_mode( h );
5744
5745         pred_mode= check_intra_pred_mode( h, pred_mode );
5746         if( pred_mode < 0 ) return -1;
5747         h->chroma_pred_mode= pred_mode;
5748     } else if( partition_count == 4 ) {
5749         int i, j, sub_partition_count[4], list, ref[2][4];
5750
5751         if( h->slice_type == FF_B_TYPE ) {
5752             for( i = 0; i < 4; i++ ) {
5753                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5754                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5755                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5756             }
5757             if( IS_DIRECT(h->sub_mb_type[0] | h->sub_mb_type[1] |
5758                           h->sub_mb_type[2] | h->sub_mb_type[3]) ) {
5759                 pred_direct_motion(h, &mb_type);
5760                 h->ref_cache[0][scan8[4]] =
5761                 h->ref_cache[1][scan8[4]] =
5762                 h->ref_cache[0][scan8[12]] =
5763                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5764                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5765                     for( i = 0; i < 4; i++ )
5766                         if( IS_DIRECT(h->sub_mb_type[i]) )
5767                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5768                 }
5769             }
5770         } else {
5771             for( i = 0; i < 4; i++ ) {
5772                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5773                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5774                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5775             }
5776         }
5777
5778         for( list = 0; list < h->list_count; list++ ) {
5779                 for( i = 0; i < 4; i++ ) {
5780                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5781                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5782                         if( h->ref_count[list] > 1 )
5783                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5784                         else
5785                             ref[list][i] = 0;
5786                     } else {
5787                         ref[list][i] = -1;
5788                     }
5789                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5790                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5791                 }
5792         }
5793
5794         if(dct8x8_allowed)
5795             dct8x8_allowed = get_dct8x8_allowed(h);
5796
5797         for(list=0; list<h->list_count; list++){
5798             for(i=0; i<4; i++){
5799                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5800                 if(IS_DIRECT(h->sub_mb_type[i])){
5801                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5802                     continue;
5803                 }
5804
5805                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5806                     const int sub_mb_type= h->sub_mb_type[i];
5807                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5808                     for(j=0; j<sub_partition_count[i]; j++){
5809                         int mpx, mpy;
5810                         int mx, my;
5811                         const int index= 4*i + block_width*j;
5812                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5813                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5814                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5815
5816                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5817                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5818                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5819
5820                         if(IS_SUB_8X8(sub_mb_type)){
5821                             mv_cache[ 1 ][0]=
5822                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5823                             mv_cache[ 1 ][1]=
5824                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5825
5826                             mvd_cache[ 1 ][0]=
5827                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5828                             mvd_cache[ 1 ][1]=
5829                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5830                         }else if(IS_SUB_8X4(sub_mb_type)){
5831                             mv_cache[ 1 ][0]= mx;
5832                             mv_cache[ 1 ][1]= my;
5833
5834                             mvd_cache[ 1 ][0]= mx - mpx;
5835                             mvd_cache[ 1 ][1]= my - mpy;
5836                         }else if(IS_SUB_4X8(sub_mb_type)){
5837                             mv_cache[ 8 ][0]= mx;
5838                             mv_cache[ 8 ][1]= my;
5839
5840                             mvd_cache[ 8 ][0]= mx - mpx;
5841                             mvd_cache[ 8 ][1]= my - mpy;
5842                         }
5843                         mv_cache[ 0 ][0]= mx;
5844                         mv_cache[ 0 ][1]= my;
5845
5846                         mvd_cache[ 0 ][0]= mx - mpx;
5847                         mvd_cache[ 0 ][1]= my - mpy;
5848                     }
5849                 }else{
5850                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5851                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5852                     p[0] = p[1] = p[8] = p[9] = 0;
5853                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5854                 }
5855             }
5856         }
5857     } else if( IS_DIRECT(mb_type) ) {
5858         pred_direct_motion(h, &mb_type);
5859         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5860         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5861         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5862     } else {
5863         int list, mx, my, i, mpx, mpy;
5864         if(IS_16X16(mb_type)){
5865             for(list=0; list<h->list_count; list++){
5866                 if(IS_DIR(mb_type, 0, list)){
5867                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5868                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5869                 }else
5870                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1); //FIXME factorize and the other fill_rect below too
5871             }
5872             for(list=0; list<h->list_count; list++){
5873                 if(IS_DIR(mb_type, 0, list)){
5874                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5875
5876                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5877                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5878                     tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5879
5880                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5881                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5882                 }else
5883                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5884             }
5885         }
5886         else if(IS_16X8(mb_type)){
5887             for(list=0; list<h->list_count; list++){
5888                     for(i=0; i<2; i++){
5889                         if(IS_DIR(mb_type, i, list)){
5890                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5891                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5892                         }else
5893                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5894                     }
5895             }
5896             for(list=0; list<h->list_count; list++){
5897                 for(i=0; i<2; i++){
5898                     if(IS_DIR(mb_type, i, list)){
5899                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5900                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5901                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5902                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5903
5904                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5905                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5906                     }else{
5907                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5908                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5909                     }
5910                 }
5911             }
5912         }else{
5913             assert(IS_8X16(mb_type));
5914             for(list=0; list<h->list_count; list++){
5915                     for(i=0; i<2; i++){
5916                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5917                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5918                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5919                         }else
5920                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5921                     }
5922             }
5923             for(list=0; list<h->list_count; list++){
5924                 for(i=0; i<2; i++){
5925                     if(IS_DIR(mb_type, i, list)){
5926                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5927                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5928                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5929
5930                         tprintf(s->avctx, "final mv:%d %d\n", mx, my);
5931                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5932                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5933                     }else{
5934                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5935                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5936                     }
5937                 }
5938             }
5939         }
5940     }
5941
5942    if( IS_INTER( mb_type ) ) {
5943         h->chroma_pred_mode_table[mb_xy] = 0;
5944         write_back_motion( h, mb_type );
5945    }
5946
5947     if( !IS_INTRA16x16( mb_type ) ) {
5948         cbp  = decode_cabac_mb_cbp_luma( h );
5949         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5950     }
5951
5952     h->cbp_table[mb_xy] = h->cbp = cbp;
5953
5954     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
5955         if( decode_cabac_mb_transform_size( h ) )
5956             mb_type |= MB_TYPE_8x8DCT;
5957     }
5958     s->current_picture.mb_type[mb_xy]= mb_type;
5959
5960     if( cbp || IS_INTRA16x16( mb_type ) ) {
5961         const uint8_t *scan, *scan8x8, *dc_scan;
5962         const uint32_t *qmul;
5963         int dqp;
5964
5965         if(IS_INTERLACED(mb_type)){
5966             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
5967             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5968             dc_scan= luma_dc_field_scan;
5969         }else{
5970             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
5971             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5972             dc_scan= luma_dc_zigzag_scan;
5973         }
5974
5975         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5976         if( dqp == INT_MIN ){
5977             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
5978             return -1;
5979         }
5980         s->qscale += dqp;
5981         if(((unsigned)s->qscale) > 51){
5982             if(s->qscale<0) s->qscale+= 52;
5983             else            s->qscale-= 52;
5984         }
5985         h->chroma_qp[0] = get_chroma_qp(h, 0, s->qscale);
5986         h->chroma_qp[1] = get_chroma_qp(h, 1, s->qscale);
5987
5988         if( IS_INTRA16x16( mb_type ) ) {
5989             int i;
5990             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5991             decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16);
5992
5993             if( cbp&15 ) {
5994                 qmul = h->dequant4_coeff[0][s->qscale];
5995                 for( i = 0; i < 16; i++ ) {
5996                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5997                     decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, qmul, 15);
5998                 }
5999             } else {
6000                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6001             }
6002         } else {
6003             int i8x8, i4x4;
6004             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6005                 if( cbp & (1<<i8x8) ) {
6006                     if( IS_8x8DCT(mb_type) ) {
6007                         decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6008                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64);
6009                     } else {
6010                         qmul = h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale];
6011                         for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6012                             const int index = 4*i8x8 + i4x4;
6013                             //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6014 //START_TIMER
6015                             decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, qmul, 16);
6016 //STOP_TIMER("decode_residual")
6017                         }
6018                     }
6019                 } else {
6020                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6021                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6022                 }
6023             }
6024         }
6025
6026         if( cbp&0x30 ){
6027             int c;
6028             for( c = 0; c < 2; c++ ) {
6029                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6030                 decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4);
6031             }
6032         }
6033
6034         if( cbp&0x20 ) {
6035             int c, i;
6036             for( c = 0; c < 2; c++ ) {
6037                 qmul = h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp[c]];
6038                 for( i = 0; i < 4; i++ ) {
6039                     const int index = 16 + 4 * c + i;
6040                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6041                     decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, qmul, 15);
6042                 }
6043             }
6044         } else {
6045             uint8_t * const nnz= &h->non_zero_count_cache[0];
6046             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6047             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6048         }
6049     } else {
6050         uint8_t * const nnz= &h->non_zero_count_cache[0];
6051         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6052         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6053         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6054         h->last_qscale_diff = 0;
6055     }
6056
6057     s->current_picture.qscale_table[mb_xy]= s->qscale;
6058     write_back_non_zero_count(h);
6059
6060     if(MB_MBAFF){
6061         h->ref_count[0] >>= 1;
6062         h->ref_count[1] >>= 1;
6063     }
6064
6065     return 0;
6066 }
6067
6068
6069 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6070     int i, d;
6071     const int index_a = qp + h->slice_alpha_c0_offset;
6072     const int alpha = (alpha_table+52)[index_a];
6073     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6074
6075     if( bS[0] < 4 ) {
6076         int8_t tc[4];
6077         for(i=0; i<4; i++)
6078             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6079         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6080     } else {
6081         /* 16px edge length, because bS=4 is triggered by being at
6082          * the edge of an intra MB, so all 4 bS are the same */
6083             for( d = 0; d < 16; d++ ) {
6084                 const int p0 = pix[-1];
6085                 const int p1 = pix[-2];
6086                 const int p2 = pix[-3];
6087
6088                 const int q0 = pix[0];
6089                 const int q1 = pix[1];
6090                 const int q2 = pix[2];
6091
6092                 if( FFABS( p0 - q0 ) < alpha &&
6093                     FFABS( p1 - p0 ) < beta &&
6094                     FFABS( q1 - q0 ) < beta ) {
6095
6096                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6097                         if( FFABS( p2 - p0 ) < beta)
6098                         {
6099                             const int p3 = pix[-4];
6100                             /* p0', p1', p2' */
6101                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6102                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6103                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6104                         } else {
6105                             /* p0' */
6106                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6107                         }
6108                         if( FFABS( q2 - q0 ) < beta)
6109                         {
6110                             const int q3 = pix[3];
6111                             /* q0', q1', q2' */
6112                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6113                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6114                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6115                         } else {
6116                             /* q0' */
6117                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6118                         }
6119                     }else{
6120                         /* p0', q0' */
6121                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6122                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6123                     }
6124                     tprintf(h->s.avctx, "filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6125                 }
6126                 pix += stride;
6127             }
6128     }
6129 }
6130 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6131     int i;
6132     const int index_a = qp + h->slice_alpha_c0_offset;
6133     const int alpha = (alpha_table+52)[index_a];
6134     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6135
6136     if( bS[0] < 4 ) {
6137         int8_t tc[4];
6138         for(i=0; i<4; i++)
6139             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6140         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6141     } else {
6142         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6143     }
6144 }
6145
6146 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6147     int i;
6148     for( i = 0; i < 16; i++, pix += stride) {
6149         int index_a;
6150         int alpha;
6151         int beta;
6152
6153         int qp_index;
6154         int bS_index = (i >> 1);
6155         if (!MB_FIELD) {
6156             bS_index &= ~1;
6157             bS_index |= (i & 1);
6158         }
6159
6160         if( bS[bS_index] == 0 ) {
6161             continue;
6162         }
6163
6164         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6165         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6166         alpha = (alpha_table+52)[index_a];
6167         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6168
6169         if( bS[bS_index] < 4 ) {
6170             const int tc0 = (tc0_table+52)[index_a][bS[bS_index] - 1];
6171             const int p0 = pix[-1];
6172             const int p1 = pix[-2];
6173             const int p2 = pix[-3];
6174             const int q0 = pix[0];
6175             const int q1 = pix[1];
6176             const int q2 = pix[2];
6177
6178             if( FFABS( p0 - q0 ) < alpha &&
6179                 FFABS( p1 - p0 ) < beta &&
6180                 FFABS( q1 - q0 ) < beta ) {
6181                 int tc = tc0;
6182                 int i_delta;
6183
6184                 if( FFABS( p2 - p0 ) < beta ) {
6185                     pix[-2] = p1 + av_clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6186                     tc++;
6187                 }
6188                 if( FFABS( q2 - q0 ) < beta ) {
6189                     pix[1] = q1 + av_clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6190                     tc++;
6191                 }
6192
6193                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6194                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6195                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6196                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6197             }
6198         }else{
6199             const int p0 = pix[-1];
6200             const int p1 = pix[-2];
6201             const int p2 = pix[-3];
6202
6203             const int q0 = pix[0];
6204             const int q1 = pix[1];
6205             const int q2 = pix[2];
6206
6207             if( FFABS( p0 - q0 ) < alpha &&
6208                 FFABS( p1 - p0 ) < beta &&
6209                 FFABS( q1 - q0 ) < beta ) {
6210
6211                 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6212                     if( FFABS( p2 - p0 ) < beta)
6213                     {
6214                         const int p3 = pix[-4];
6215                         /* p0', p1', p2' */
6216                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6217                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6218                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6219                     } else {
6220                         /* p0' */
6221                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6222                     }
6223                     if( FFABS( q2 - q0 ) < beta)
6224                     {
6225                         const int q3 = pix[3];
6226                         /* q0', q1', q2' */
6227                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6228                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6229                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6230                     } else {
6231                         /* q0' */
6232                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6233                     }
6234                 }else{
6235                     /* p0', q0' */
6236                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6237                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6238                 }
6239                 tprintf(h->s.avctx, "filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6240             }
6241         }
6242     }
6243 }
6244 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int16_t bS[8], int qp[2] ) {
6245     int i;
6246     for( i = 0; i < 8; i++, pix += stride) {
6247         int index_a;
6248         int alpha;
6249         int beta;
6250
6251         int qp_index;
6252         int bS_index = i;
6253
6254         if( bS[bS_index] == 0 ) {
6255             continue;
6256         }
6257
6258         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6259         index_a = qp[qp_index] + h->slice_alpha_c0_offset;
6260         alpha = (alpha_table+52)[index_a];
6261         beta  = (beta_table+52)[qp[qp_index] + h->slice_beta_offset];
6262
6263         if( bS[bS_index] < 4 ) {
6264             const int tc = (tc0_table+52)[index_a][bS[bS_index] - 1] + 1;
6265             const int p0 = pix[-1];
6266             const int p1 = pix[-2];
6267             const int q0 = pix[0];
6268             const int q1 = pix[1];
6269
6270             if( FFABS( p0 - q0 ) < alpha &&
6271                 FFABS( p1 - p0 ) < beta &&
6272                 FFABS( q1 - q0 ) < beta ) {
6273                 const int i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6274
6275                 pix[-1] = av_clip_uint8( p0 + i_delta );    /* p0' */
6276                 pix[0]  = av_clip_uint8( q0 - i_delta );    /* q0' */
6277                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6278             }
6279         }else{
6280             const int p0 = pix[-1];
6281             const int p1 = pix[-2];
6282             const int q0 = pix[0];
6283             const int q1 = pix[1];
6284
6285             if( FFABS( p0 - q0 ) < alpha &&
6286                 FFABS( p1 - p0 ) < beta &&
6287                 FFABS( q1 - q0 ) < beta ) {
6288
6289                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6290                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6291                 tprintf(h->s.avctx, "filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6292             }
6293         }
6294     }
6295 }
6296
6297 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6298     int i, d;
6299     const int index_a = qp + h->slice_alpha_c0_offset;
6300     const int alpha = (alpha_table+52)[index_a];
6301     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6302     const int pix_next  = stride;
6303
6304     if( bS[0] < 4 ) {
6305         int8_t tc[4];
6306         for(i=0; i<4; i++)
6307             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] : -1;
6308         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6309     } else {
6310         /* 16px edge length, see filter_mb_edgev */
6311             for( d = 0; d < 16; d++ ) {
6312                 const int p0 = pix[-1*pix_next];
6313                 const int p1 = pix[-2*pix_next];
6314                 const int p2 = pix[-3*pix_next];
6315                 const int q0 = pix[0];
6316                 const int q1 = pix[1*pix_next];
6317                 const int q2 = pix[2*pix_next];
6318
6319                 if( FFABS( p0 - q0 ) < alpha &&
6320                     FFABS( p1 - p0 ) < beta &&
6321                     FFABS( q1 - q0 ) < beta ) {
6322
6323                     const int p3 = pix[-4*pix_next];
6324                     const int q3 = pix[ 3*pix_next];
6325
6326                     if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6327                         if( FFABS( p2 - p0 ) < beta) {
6328                             /* p0', p1', p2' */
6329                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6330                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6331                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6332                         } else {
6333                             /* p0' */
6334                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6335                         }
6336                         if( FFABS( q2 - q0 ) < beta) {
6337                             /* q0', q1', q2' */
6338                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6339                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6340                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6341                         } else {
6342                             /* q0' */
6343                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6344                         }
6345                     }else{
6346                         /* p0', q0' */
6347                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6348                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6349                     }
6350                     tprintf(h->s.avctx, "filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6351                 }
6352                 pix++;
6353             }
6354     }
6355 }
6356
6357 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int16_t bS[4], int qp ) {
6358     int i;
6359     const int index_a = qp + h->slice_alpha_c0_offset;
6360     const int alpha = (alpha_table+52)[index_a];
6361     const int beta  = (beta_table+52)[qp + h->slice_beta_offset];
6362
6363     if( bS[0] < 4 ) {
6364         int8_t tc[4];
6365         for(i=0; i<4; i++)
6366             tc[i] = bS[i] ? (tc0_table+52)[index_a][bS[i] - 1] + 1 : 0;
6367         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6368     } else {
6369         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6370     }
6371 }
6372
6373 static void filter_mb_fast( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6374     MpegEncContext * const s = &h->s;
6375     int mb_y_firstrow = s->picture_structure == PICT_BOTTOM_FIELD;
6376     int mb_xy, mb_type;
6377     int qp, qp0, qp1, qpc, qpc0, qpc1, qp_thresh;
6378
6379     mb_xy = mb_x + mb_y*s->mb_stride;
6380
6381     if(mb_x==0 || mb_y==mb_y_firstrow || !s->dsp.h264_loop_filter_strength || h->pps.chroma_qp_diff ||
6382        (h->deblocking_filter == 2 && (h->slice_table[mb_xy] != h->slice_table[h->top_mb_xy] ||
6383                                       h->slice_table[mb_xy] != h->slice_table[mb_xy - 1]))) {
6384         filter_mb(h, mb_x, mb_y, img_y, img_cb, img_cr, linesize, uvlinesize);
6385         return;
6386     }
6387     assert(!FRAME_MBAFF);
6388
6389     mb_type = s->current_picture.mb_type[mb_xy];
6390     qp = s->current_picture.qscale_table[mb_xy];
6391     qp0 = s->current_picture.qscale_table[mb_xy-1];
6392     qp1 = s->current_picture.qscale_table[h->top_mb_xy];
6393     qpc = get_chroma_qp( h, 0, qp );
6394     qpc0 = get_chroma_qp( h, 0, qp0 );
6395     qpc1 = get_chroma_qp( h, 0, qp1 );
6396     qp0 = (qp + qp0 + 1) >> 1;
6397     qp1 = (qp + qp1 + 1) >> 1;
6398     qpc0 = (qpc + qpc0 + 1) >> 1;
6399     qpc1 = (qpc + qpc1 + 1) >> 1;
6400     qp_thresh = 15 - h->slice_alpha_c0_offset;
6401     if(qp <= qp_thresh && qp0 <= qp_thresh && qp1 <= qp_thresh &&
6402        qpc <= qp_thresh && qpc0 <= qp_thresh && qpc1 <= qp_thresh)
6403         return;
6404
6405     if( IS_INTRA(mb_type) ) {
6406         int16_t bS4[4] = {4,4,4,4};
6407         int16_t bS3[4] = {3,3,3,3};
6408         int16_t *bSH = FIELD_PICTURE ? bS3 : bS4;
6409         if( IS_8x8DCT(mb_type) ) {
6410             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6411             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6412             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6413             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6414         } else {
6415             filter_mb_edgev( h, &img_y[4*0], linesize, bS4, qp0 );
6416             filter_mb_edgev( h, &img_y[4*1], linesize, bS3, qp );
6417             filter_mb_edgev( h, &img_y[4*2], linesize, bS3, qp );
6418             filter_mb_edgev( h, &img_y[4*3], linesize, bS3, qp );
6419             filter_mb_edgeh( h, &img_y[4*0*linesize], linesize, bSH, qp1 );
6420             filter_mb_edgeh( h, &img_y[4*1*linesize], linesize, bS3, qp );
6421             filter_mb_edgeh( h, &img_y[4*2*linesize], linesize, bS3, qp );
6422             filter_mb_edgeh( h, &img_y[4*3*linesize], linesize, bS3, qp );
6423         }
6424         filter_mb_edgecv( h, &img_cb[2*0], uvlinesize, bS4, qpc0 );
6425         filter_mb_edgecv( h, &img_cb[2*2], uvlinesize, bS3, qpc );
6426         filter_mb_edgecv( h, &img_cr[2*0], uvlinesize, bS4, qpc0 );
6427         filter_mb_edgecv( h, &img_cr[2*2], uvlinesize, bS3, qpc );
6428         filter_mb_edgech( h, &img_cb[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6429         filter_mb_edgech( h, &img_cb[2*2*uvlinesize], uvlinesize, bS3, qpc );
6430         filter_mb_edgech( h, &img_cr[2*0*uvlinesize], uvlinesize, bSH, qpc1 );
6431         filter_mb_edgech( h, &img_cr[2*2*uvlinesize], uvlinesize, bS3, qpc );
6432         return;
6433     } else {
6434         DECLARE_ALIGNED_8(int16_t, bS[2][4][4]);
6435         uint64_t (*bSv)[4] = (uint64_t(*)[4])bS;
6436         int edges;
6437         if( IS_8x8DCT(mb_type) && (h->cbp&7) == 7 ) {
6438             edges = 4;
6439             bSv[0][0] = bSv[0][2] = bSv[1][0] = bSv[1][2] = 0x0002000200020002ULL;
6440         } else {
6441             int mask_edge1 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16)) ? 3 :
6442                              (mb_type & MB_TYPE_16x8) ? 1 : 0;
6443             int mask_edge0 = (mb_type & (MB_TYPE_16x16 | MB_TYPE_8x16))
6444                              && (s->current_picture.mb_type[mb_xy-1] & (MB_TYPE_16x16 | MB_TYPE_8x16))
6445                              ? 3 : 0;
6446             int step = IS_8x8DCT(mb_type) ? 2 : 1;
6447             edges = (mb_type & MB_TYPE_16x16) && !(h->cbp & 15) ? 1 : 4;
6448             s->dsp.h264_loop_filter_strength( bS, h->non_zero_count_cache, h->ref_cache, h->mv_cache,
6449                                               (h->slice_type == FF_B_TYPE), edges, step, mask_edge0, mask_edge1 );
6450         }
6451         if( IS_INTRA(s->current_picture.mb_type[mb_xy-1]) )
6452             bSv[0][0] = 0x0004000400040004ULL;
6453         if( IS_INTRA(s->current_picture.mb_type[h->top_mb_xy]) )
6454             bSv[1][0] = FIELD_PICTURE ? 0x0003000300030003ULL : 0x0004000400040004ULL;
6455
6456 #define FILTER(hv,dir,edge)\
6457         if(bSv[dir][edge]) {\
6458             filter_mb_edge##hv( h, &img_y[4*edge*(dir?linesize:1)], linesize, bS[dir][edge], edge ? qp : qp##dir );\
6459             if(!(edge&1)) {\
6460                 filter_mb_edgec##hv( h, &img_cb[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6461                 filter_mb_edgec##hv( h, &img_cr[2*edge*(dir?uvlinesize:1)], uvlinesize, bS[dir][edge], edge ? qpc : qpc##dir );\
6462             }\
6463         }
6464         if( edges == 1 ) {
6465             FILTER(v,0,0);
6466             FILTER(h,1,0);
6467         } else if( IS_8x8DCT(mb_type) ) {
6468             FILTER(v,0,0);
6469             FILTER(v,0,2);
6470             FILTER(h,1,0);
6471             FILTER(h,1,2);
6472         } else {
6473             FILTER(v,0,0);
6474             FILTER(v,0,1);
6475             FILTER(v,0,2);
6476             FILTER(v,0,3);
6477             FILTER(h,1,0);
6478             FILTER(h,1,1);
6479             FILTER(h,1,2);
6480             FILTER(h,1,3);
6481         }
6482 #undef FILTER
6483     }
6484 }
6485
/**
 * Deblocks the edges of one macroblock; this is the general path that also
 * handles MBAFF frames.
 *
 * Unless FRAME_MBAFF is set, the function returns immediately when the
 * macroblock qp and its rounded averages with the left/top neighbours (only
 * consulted when mb_x/mb_y are non-zero) are all at or below a threshold
 * derived from slice_alpha_c0_offset and the chroma qp index offsets.
 * Otherwise, for each direction (0: vertical edges, 1: horizontal edges) it
 * derives four boundary strengths (bS) per edge and hands every edge whose bS
 * are not all zero to the luma and chroma edge filters.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma pointer handed to the edge filters (edge 0 position)
 * @param img_cb     first chroma plane pointer handed to the edge filters
 * @param img_cr     second chroma plane pointer handed to the edge filters
 * @param linesize   luma stride handed to the edge filters
 * @param uvlinesize chroma stride handed to the edge filters
 */
6486 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6487     MpegEncContext * const s = &h->s;
6488     const int mb_xy= mb_x + mb_y*s->mb_stride;
6489     const int mb_type = s->current_picture.mb_type[mb_xy];
         // vertical mv difference that triggers bS=1: 2 for interlaced macroblocks, 4 otherwise
6490     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
6491     int first_vertical_edge_done = 0;
6492     int dir;
6493     /* FIXME: A given frame may occupy more than one position in
6494      * the reference list. So ref2frm should be populated with
6495      * frame numbers, not indices. */
6496     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
6497                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
6498
6499     //for sufficiently low qp, filtering wouldn't do anything
6500     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6501     if(!FRAME_MBAFF){
6502         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, FFMAX(h->pps.chroma_qp_index_offset[0], h->pps.chroma_qp_index_offset[1]));
6503         int qp = s->current_picture.qscale_table[mb_xy];
6504         if(qp <= qp_thresh
6505            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6506            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6507             return;
6508         }
6509     }
6510
6511     if (FRAME_MBAFF
6512             // left mb is in picture
6513             && h->slice_table[mb_xy-1] != 255
6514             // and current and left pair do not have the same interlaced type
6515             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6516             // and left mb is in the same slice if deblocking_filter == 2
6517             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6518         /* First vertical edge is different in MBAFF frames
6519          * There are 8 different bS to compute and 2 different Qp
6520          */
6521         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
             // the two macroblocks of the left pair (top, bottom)
6522         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
6523         int16_t bS[8];
6524         int qp[2];
6525         int bqp[2];
6526         int rqp[2];
6527         int mb_qp, mbn0_qp, mbn1_qp;
6528         int i;
             // makes the dir==0 pass below start at edge 1, as edge 0 is filtered here
6529         first_vertical_edge_done = 1;
6530
6531         if( IS_INTRA(mb_type) )
6532             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
6533         else {
6534             for( i = 0; i < 8; i++ ) {
6535                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
6536
6537                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
6538                     bS[i] = 4;
6539                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
6540                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6541                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
6542                     bS[i] = 2;
6543                 else
6544                     bS[i] = 1;
6545             }
6546         }
6547
             // luma (qp), first chroma (bqp) and second chroma (rqp) quantizers:
             // rounded averages of the current macroblock and each left macroblock
6548         mb_qp = s->current_picture.qscale_table[mb_xy];
6549         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
6550         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
6551         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
6552         bqp[0] = ( get_chroma_qp( h, 0, mb_qp ) +
6553                    get_chroma_qp( h, 0, mbn0_qp ) + 1 ) >> 1;
6554         rqp[0] = ( get_chroma_qp( h, 1, mb_qp ) +
6555                    get_chroma_qp( h, 1, mbn0_qp ) + 1 ) >> 1;
6556         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
6557         bqp[1] = ( get_chroma_qp( h, 0, mb_qp ) +
6558                    get_chroma_qp( h, 0, mbn1_qp ) + 1 ) >> 1;
6559         rqp[1] = ( get_chroma_qp( h, 1, mb_qp ) +
6560                    get_chroma_qp( h, 1, mbn1_qp ) + 1 ) >> 1;
6561
6562         /* Filter edge */
6563         tprintf(s->avctx, "filter mb:%d/%d MBAFF, QPy:%d/%d, QPb:%d/%d QPr:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], bqp[0], bqp[1], rqp[0], rqp[1], linesize, uvlinesize);
6564         { int i; for (i = 0; i < 8; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6565         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6566         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, bqp );
6567         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, rqp );
6568     }
6569     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6570     for( dir = 0; dir < 2; dir++ )
6571     {
             // initialised to 0 so that the trace in the MBAFF special case below,
             // which runs before the edge loop assigns it, never reads an
             // indeterminate value (that block filters edge 0)
6572         int edge = 0;
             // neighbour across edge 0: left macroblock for dir 0, top macroblock for dir 1
6573         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6574         const int mbm_type = s->current_picture.mb_type[mbm_xy];
             // skip edge 0 when the neighbour's slice_table entry is 255
6575         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6576
             // only edge 0 is considered for skipped 16x16 macroblocks
6577         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6578                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6579         // how often to recheck mv-based bS when iterating between edges
6580         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6581                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6582         // how often to recheck mv-based bS when iterating along each edge
6583         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6584
6585         if (first_vertical_edge_done) {
6586             start = 1;
6587             first_vertical_edge_done = 0;
6588         }
6589
             // with deblocking_filter==2 edges shared with another slice are not filtered
6590         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6591             start = 1;
6592
6593         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
6594             && !IS_INTERLACED(mb_type)
6595             && IS_INTERLACED(mbm_type)
6596             ) {
6597             // This is a special case in the norm where the filtering must
6598             // be done twice (one each of the field) even if we are in a
6599             // frame macroblock.
6600             //
6601             static const int nnz_idx[4] = {4,5,6,3};
6602             unsigned int tmp_linesize   = 2 *   linesize;
6603             unsigned int tmp_uvlinesize = 2 * uvlinesize;
6604             int mbn_xy = mb_xy - 2 * s->mb_stride;
6605             int qp;
6606             int i, j;
6607             int16_t bS[4];
6608
                 // j selects the starting line (0 or 1); doubled strides make each pass touch every other line
6609             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
6610                 if( IS_INTRA(mb_type) ||
6611                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6612                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6613                 } else {
6614                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
6615                     for( i = 0; i < 4; i++ ) {
6616                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
6617                             mbn_nnz[nnz_idx[i]] != 0 )
6618                             bS[i] = 2;
6619                         else
6620                             bS[i] = 1;
6621                     }
6622                 }
6623                 // Do not use s->qscale as luma quantizer because it has not the same
6624                 // value in IPCM macroblocks.
6625                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6626                 tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6627                 { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
6628                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
6629                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS,
6630                                   ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6631                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS,
6632                                   ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6633             }
6634
                 // edge 0 has been handled above
6635             start = 1;
6636         }
6637
6638         /* Calculate bS */
6639         for( edge = start; edge < edges; edge++ ) {
6640             /* mbn_xy: neighbor macroblock */
6641             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6642             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6643             int16_t bS[4];
6644             int qp;
6645
                 // odd edges are skipped for macroblocks using the 8x8 transform
6646             if( (edge&1) && IS_8x8DCT(mb_type) )
6647                 continue;
6648
6649             if( IS_INTRA(mb_type) ||
6650                 IS_INTRA(mbn_type) ) {
                     // intra: bS is 4 on edge 0 under the conditions below, 3 everywhere else
6651                 int value;
6652                 if (edge == 0) {
6653                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6654                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6655                     ) {
6656                         value = 4;
6657                     } else {
6658                         value = 3;
6659                     }
6660                 } else {
6661                     value = 3;
6662                 }
6663                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6664             } else {
6665                 int i, l;
                     // set when the mv/ref based strength has been decided for the whole edge at once
6666                 int mv_done;
6667
6668                 if( edge & mask_edge ) {
6669                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6670                     mv_done = 1;
6671                 }
6672                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
6673                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
6674                     mv_done = 1;
6675                 }
6676                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6677                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6678                     int bn_idx= b_idx - (dir ? 8:1);
6679                     int v = 0;
6680                     for( l = 0; !v && l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6681                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6682                              FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6683                              FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
6684                     }
6685                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6686                     mv_done = 1;
6687                 }
6688                 else
6689                     mv_done = 0;
6690
                     // per 4-sample segment: non-zero coefficients on either side give bS 2,
                     // otherwise (unless already decided above) a ref/mv mismatch gives bS 1
6691                 for( i = 0; i < 4; i++ ) {
6692                     int x = dir == 0 ? edge : i;
6693                     int y = dir == 0 ? i    : edge;
6694                     int b_idx= 8 + 4 + x + 8*y;
6695                     int bn_idx= b_idx - (dir ? 8:1);
6696
6697                     if( h->non_zero_count_cache[b_idx] != 0 ||
6698                         h->non_zero_count_cache[bn_idx] != 0 ) {
6699                         bS[i] = 2;
6700                     }
6701                     else if(!mv_done)
6702                     {
6703                         bS[i] = 0;
6704                         for( l = 0; l < 1 + (h->slice_type == FF_B_TYPE); l++ ) {
6705                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6706                                 FFABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6707                                 FFABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
6708                                 bS[i] = 1;
6709                                 break;
6710                             }
6711                         }
6712                     }
6713                 }
6714
                     // nothing to filter on this edge
6715                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6716                     continue;
6717             }
6718
6719             /* Filter edge */
6720             // Do not use s->qscale as luma quantizer because it has not the same
6721             // value in IPCM macroblocks.
6722             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6723             //tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6724             tprintf(s->avctx, "filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6725             { int i; for (i = 0; i < 4; i++) tprintf(s->avctx, " bS[%d]:%d", i, bS[i]); tprintf(s->avctx, "\n"); }
                 // chroma is only filtered on even luma edges (0 and 2)
6726             if( dir == 0 ) {
6727                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6728                 if( (edge&1) == 0 ) {
6729                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS,
6730                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6731                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS,
6732                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6733                 }
6734             } else {
6735                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6736                 if( (edge&1) == 0 ) {
6737                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS,
6738                                       ( h->chroma_qp[0] + get_chroma_qp( h, 0, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6739                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS,
6740                                       ( h->chroma_qp[1] + get_chroma_qp( h, 1, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1);
6741                 }
6742             }
6743         }
6744     }
6745 }
6746
/**
 * Decodes the macroblocks of the current slice.
 *
 * Depending on h->pps.cabac, macroblocks are parsed with decode_mb_cabac() or
 * decode_mb_cavlc() and reconstructed with hl_decode_mb(); when FRAME_MBAFF is
 * set, two vertically adjacent macroblocks are handled per iteration. The
 * decoded (or damaged) macroblock range is reported through ff_er_add_slice().
 *
 * @param avctx codec context (not referenced in the body, which uses s->avctx)
 * @param h     decoder context
 * @return 0 when the slice ended cleanly, -1 on a macroblock decoding error or
 *         when the CAVLC bit position does not match the end of the slice data
 */
6747 static int decode_slice(struct AVCodecContext *avctx, H264Context *h){
6748     MpegEncContext * const s = &h->s;
         // with partitioned frames only the AC status flags are passed to ff_er_add_slice()
6749     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6750
6751     s->mb_skip_run= -1;
6752
6753     if( h->pps.cabac ) {
6754         int i;
6755
6756         /* realign */
6757         align_get_bits( &s->gb );
6758
6759         /* init cabac */
6760         ff_init_cabac_states( &h->cabac);
             // the CABAC decoder works on the bytes that remain after the current bit position
6761         ff_init_cabac_decoder( &h->cabac,
6762                                s->gb.buffer + get_bits_count(&s->gb)/8,
6763                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6764         /* calculate pre-state */
             // each of the 460 context states is derived from its init table entry and s->qscale
6765         for( i= 0; i < 460; i++ ) {
6766             int pre;
6767             if( h->slice_type == FF_I_TYPE )
6768                 pre = av_clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6769             else
6770                 pre = av_clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6771
6772             if( pre <= 63 )
6773                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6774             else
6775                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6776         }
6777
6778         for(;;){
6779 //START_TIMER
6780             int ret = decode_mb_cabac(h);
6781             int eos;
6782 //STOP_TIMER("decode_mb_cabac")
6783
6784             if(ret>=0) hl_decode_mb(h);
6785
                 // MBAFF: decode the second macroblock of the pair one row below, then restore mb_y
6786             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6787                 s->mb_y++;
6788
6789                 if(ret>=0) ret = decode_mb_cabac(h);
6790
6791                 if(ret>=0) hl_decode_mb(h);
6792                 s->mb_y--;
6793             }
6794             eos = get_cabac_terminate( &h->cabac );
6795
                 // a failed macroblock or a read position more than 2 bytes past the end is an error
6796             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 2) {
6797                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%td)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
6798                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6799                 return -1;
6800             }
6801
                 // end of row: emit the finished band and advance one row (two for field/MBAFF pictures)
6802             if( ++s->mb_x >= s->mb_width ) {
6803                 s->mb_x = 0;
6804                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6805                 ++s->mb_y;
6806                 if(FIELD_OR_MBAFF_PICTURE) {
6807                     ++s->mb_y;
6808                 }
6809             }
6810
6811             if( eos || s->mb_y >= s->mb_height ) {
6812                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6813                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6814                 return 0;
6815             }
6816         }
6817
6818     } else {
6819         for(;;){
6820             int ret = decode_mb_cavlc(h);
6821
6822             if(ret>=0) hl_decode_mb(h);
6823
                 // MBAFF: decode the second macroblock of the pair one row below, then restore mb_y
6824             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
6825                 s->mb_y++;
6826                 ret = decode_mb_cavlc(h);
6827
6828                 if(ret>=0) hl_decode_mb(h);
6829                 s->mb_y--;
6830             }
6831
6832             if(ret<0){
6833                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6834                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6835
6836                 return -1;
6837             }
6838
6839             if(++s->mb_x >= s->mb_width){
6840                 s->mb_x=0;
6841                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6842                 ++s->mb_y;
6843                 if(FIELD_OR_MBAFF_PICTURE) {
6844                     ++s->mb_y;
6845                 }
                     // past the last row: success only if exactly all bits were consumed
6846                 if(s->mb_y >= s->mb_height){
6847                     tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6848
6849                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6850                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6851
6852                         return 0;
6853                     }else{
                             // NOTE(review): this failure path reports *_END flags while the
                             // equivalent path further below reports *_ERROR - confirm intent
6854                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6855
6856                         return -1;
6857                     }
6858                 }
6859             }
6860
                 // bitstream exhausted with no pending skip run: the slice ends here
6861             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6862                 tprintf(s->avctx, "slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6863                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6864                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6865
6866                     return 0;
6867                 }else{
6868                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6869
6870                     return -1;
6871                 }
6872             }
6873         }
6874     }
6875
     // Disabled older loop; never compiled. It calls decode_mb() and contains
     // placeholder text ("s->?gb") that would not compile if it were enabled.
6876 #if 0
6877     for(;s->mb_y < s->mb_height; s->mb_y++){
6878         for(;s->mb_x < s->mb_width; s->mb_x++){
6879             int ret= decode_mb(h);
6880
6881             hl_decode_mb(h);
6882
6883             if(ret<0){
6884                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6885                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6886
6887                 return -1;
6888             }
6889
6890             if(++s->mb_x >= s->mb_width){
6891                 s->mb_x=0;
6892                 if(++s->mb_y >= s->mb_height){
6893                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6894                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6895
6896                         return 0;
6897                     }else{
6898                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6899
6900                         return -1;
6901                     }
6902                 }
6903             }
6904
6905             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6906                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6907                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6908
6909                     return 0;
6910                 }else{
6911                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6912
6913                     return -1;
6914                 }
6915             }
6916         }
6917         s->mb_x=0;
6918         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6919     }
6920 #endif
6921     return -1; //not reached
6922 }
6923
/**
 * Reads an unregistered user data SEI payload and looks for an x264 version
 * string in it.
 *
 * At most sizeof(user_data)-1 payload bytes are stored and NUL-terminated; the
 * text starting at byte 16 is matched against "x264 - core %d" and, when a
 * non-negative build number is found, it is stored in h->x264_build. Payload
 * bytes that did not fit into the buffer are skipped.
 *
 * @param h    decoder context
 * @param size payload size in bytes
 * @return 0 on success, -1 if size is smaller than 16
 */
6924 static int decode_unregistered_user_data(H264Context *h, int size){
6925     MpegEncContext * const s = &h->s;
         // 16 leading bytes (presumably the payload UUID - confirm) plus up to 255 text bytes and a terminator
6926     uint8_t user_data[16+256];
6927     int e, build, i;
6928
6929     if(size<16)
6930         return -1;
6931
6932     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
6933         user_data[i]= get_bits(&s->gb, 8);
6934     }
6935
         // terminate so that the text after byte 16 can be handled as a C string
6936     user_data[i]= 0;
6937     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
         // build is only read when sscanf reports one successful conversion
6938     if(e==1 && build>=0)
6939         h->x264_build= build;
6940
6941     if(s->avctx->debug & FF_DEBUG_BUGS)
6942         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
6943
         // consume whatever part of the payload did not fit into user_data
6944     for(; i<size; i++)
6945         skip_bits(&s->gb, 8);
6946
6947     return 0;
6948 }
6949
/**
 * Parses the SEI messages found in s->gb.
 *
 * Messages are read while more than 16 bits remain. For each message the type
 * and the payload size are read as sums of bytes, where a byte of 255 means
 * that another byte follows. Type 5 payloads are passed to
 * decode_unregistered_user_data(); all other payloads are skipped. A payload
 * size that exceeds the data left in the buffer is rejected instead of being
 * read or skipped past the end of the buffer.
 *
 * @param h decoder context
 * @return 0 on success, -1 if a payload is truncated or the user data payload
 *         could not be decoded
 */
6950 static int decode_sei(H264Context *h){
6951     MpegEncContext * const s = &h->s;
6952
6953     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
6954         int size, type;
             int bits_left;
6955
6956         type=0;
6957         do{
6958             type+= show_bits(&s->gb, 8);
6959         }while(get_bits(&s->gb, 8) == 255);
6960
6961         size=0;
6962         do{
6963             size+= show_bits(&s->gb, 8);
6964         }while(get_bits(&s->gb, 8) == 255);
6965
             // the size comes straight from the bitstream: make sure the payload
             // really fits into what is left before reading or skipping it
             bits_left= s->gb.size_in_bits - get_bits_count(&s->gb);
             if(bits_left < 0 || size > bits_left/8){
                 av_log(s->avctx, AV_LOG_ERROR, "SEI type %d size %d truncated at %d bits\n", type, size, bits_left);
                 return -1;
             }

6966         switch(type){
6967         case 5:
6968             if(decode_unregistered_user_data(h, size) < 0)
6969                 return -1;
6970             break;
6971         default:
6972             skip_bits(&s->gb, 8*size);
6973         }
6974
6975         //FIXME check bits here
6976         align_get_bits(&s->gb);
6977     }
6978
6979     return 0;
6980 }
6981
/**
 * Parses one set of HRD parameters from s->gb; none of the values are stored.
 *
 * @param h   decoder context
 * @param sps not referenced in the body
 * @return 0 on success, -1 if the coded cpb count is out of range
 */
6982 static inline int decode_hrd_parameters(H264Context *h, SPS *sps){
6983     MpegEncContext * const s = &h->s;
6984     int cpb_count, i;
6985     cpb_count = get_ue_golomb(&s->gb) + 1;
         // cpb_count drives the read loop below, so an unchecked value lets a
         // damaged stream make the parser read far beyond the available data;
         // H.264 limits cpb_cnt_minus1 to 0..31. The unsigned comparison also
         // rejects negative values.
         if(cpb_count > 32U){
             av_log(h->s.avctx, AV_LOG_ERROR, "cpb_count %d invalid\n", cpb_count);
             return -1;
         }
6986     get_bits(&s->gb, 4); /* bit_rate_scale */
6987     get_bits(&s->gb, 4); /* cpb_size_scale */
6988     for(i=0; i<cpb_count; i++){
6989         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6990         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6991         get_bits1(&s->gb);     /* cbr_flag */
6992     }
6993     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6994     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6995     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6996     get_bits(&s->gb, 5); /* time_offset_length */
         return 0;
6997 }
6998
/**
 * Parses the VUI parameters of a sequence parameter set from s->gb.
 *
 * Stored in sps: the sample aspect ratio (0/0 when absent), the timing info
 * fields, bitstream_restriction_flag and num_reorder_frames. Every other
 * element is read and discarded.
 *
 * @param h   decoder context
 * @param sps sequence parameter set receiving the parsed values
 * @return 0 on success, -1 on an illegal aspect ratio index, invalid HRD
 *         parameters or a num_reorder_frames value above 16
 */
6999 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7000     MpegEncContext * const s = &h->s;
7001     int aspect_ratio_info_present_flag;
7002     unsigned int aspect_ratio_idc;
7003     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7004
7005     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7006
7007     if( aspect_ratio_info_present_flag ) {
7008         aspect_ratio_idc= get_bits(&s->gb, 8);
             // either an explicitly coded ratio or an index into the pixel_aspect table
7009         if( aspect_ratio_idc == EXTENDED_SAR ) {
7010             sps->sar.num= get_bits(&s->gb, 16);
7011             sps->sar.den= get_bits(&s->gb, 16);
7012         }else if(aspect_ratio_idc < sizeof(pixel_aspect)/sizeof(*pixel_aspect)){
7013             sps->sar=  pixel_aspect[aspect_ratio_idc];
7014         }else{
7015             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7016             return -1;
7017         }
7018     }else{
7019         sps->sar.num=
7020         sps->sar.den= 0;
7021     }
7022 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7023
7024     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7025         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7026     }
7027
7028     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7029         get_bits(&s->gb, 3);    /* video_format */
7030         get_bits1(&s->gb);      /* video_full_range_flag */
7031         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7032             get_bits(&s->gb, 8); /* colour_primaries */
7033             get_bits(&s->gb, 8); /* transfer_characteristics */
7034             get_bits(&s->gb, 8); /* matrix_coefficients */
7035         }
7036     }
7037
7038     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7039         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7040         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7041     }
7042
7043     sps->timing_info_present_flag = get_bits1(&s->gb);
7044     if(sps->timing_info_present_flag){
7045         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7046         sps->time_scale = get_bits_long(&s->gb, 32);
7047         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7048     }
7049
         // both HRD parameter sets are validated; a bad one aborts VUI parsing
7050     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7051     if(nal_hrd_parameters_present_flag)
7052         if(decode_hrd_parameters(h, sps) < 0)
                 return -1;
7053     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7054     if(vcl_hrd_parameters_present_flag)
7055         if(decode_hrd_parameters(h, sps) < 0)
                 return -1;
7056     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7057         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7058     get_bits1(&s->gb);         /* pic_struct_present_flag */
7059
7060     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7061     if(sps->bitstream_restriction_flag){
7062         unsigned int num_reorder_frames;
7063         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7064         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7065         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7066         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7067         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7068         num_reorder_frames= get_ue_golomb(&s->gb);
7069         get_ue_golomb(&s->gb); /*max_dec_frame_buffering*/
7070
7071         if(num_reorder_frames > 16 /*max_dec_frame_buffering || max_dec_frame_buffering > 16*/){
                 // %u: num_reorder_frames is unsigned
7072             av_log(h->s.avctx, AV_LOG_ERROR, "illegal num_reorder_frames %u\n", num_reorder_frames);
7073             return -1;
7074         }
7075
7076         sps->num_reorder_frames= num_reorder_frames;
7077     }
7078
7079     return 0;
7080 }
7081
/**
 * Reads one scaling list from s->gb into factors.
 *
 * @param h             decoder context
 * @param factors       destination array of size entries
 * @param size          number of entries; 16 selects zigzag_scan, any other
 *                      value selects zigzag_scan8x8 for the write order
 * @param jvt_list      list copied when the first decoded value is 0
 * @param fallback_list list copied when the leading flag bit is 0
 */
7082 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7083                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7084     MpegEncContext * const s = &h->s;
7085     int i, last = 8, next = 8;
7086     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7087     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7088         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7089     else
7090     for(i=0;i<size;i++){
             // deltas are only read while next is non-zero; once it becomes 0
             // the last value is repeated for all remaining entries
7091         if(next)
7092             next = (last + get_se_golomb(&s->gb)) & 0xff;
7093         if(!i && !next){ /* matrix not written, we use the preset one */
7094             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7095             break;
7096         }
7097         last = factors[scan[i]] = next ? next : last;
7098     }
7099 }
7100
/**
 * Reads the scaling matrices of an SPS or a PPS from s->gb.
 *
 * If the leading flag bit is set, six 4x4 lists are decoded, followed by two
 * 8x8 lists when is_sps is set or pps->transform_8x8_mode is non-zero.
 * If the bit is clear and the SPS matrices serve as fallback, they are copied
 * to the destination; otherwise the destination arrays are left untouched.
 *
 * @param h               decoder context
 * @param sps             SPS; its matrices are the fallback when a PPS is
 *                        parsed and sps->scaling_matrix_present is set
 * @param pps             only read (transform_8x8_mode) when is_sps is 0
 * @param is_sps          non-zero when the matrices belong to the SPS itself
 * @param scaling_matrix4 destination for the six 4x4 lists
 * @param scaling_matrix8 destination for the two 8x8 lists
 */
7101 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7102                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7103     MpegEncContext * const s = &h->s;
         // a PPS falls back to the SPS matrices when the SPS carried some, else to the defaults
7104     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7105     const uint8_t *fallback[4] = {
7106         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7107         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7108         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7109         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7110     };
7111     if(get_bits1(&s->gb)){
7112         sps->scaling_matrix_present |= is_sps;
             // the chroma lists fall back to the previously decoded list of the same group
             // NOTE(review): H.264 orders the 4x4 lists Y, Cb, Cr, so the Cr/Cb
             // labels below look swapped - confirm against the users of these arrays
7113         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7114         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7115         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7116         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7117         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7118         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7119         if(is_sps || pps->transform_8x8_mode){
7120             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7121             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7122         }
7123     } else if(fallback_sps) {
7124         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7125         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7126     }
7127 }
7128
7129 /**
7130  * Returns and optionally allocates SPS / PPS structures in the supplied array 'vec'
7131  */
7132 static void *
7133 alloc_parameter_set(H264Context *h, void **vec, const unsigned int id, const unsigned int max,
7134                     const size_t size, const char *name)
7135 {
7136     if(id>=max) {
7137         av_log(h->s.avctx, AV_LOG_ERROR, "%s_id (%d) out of range\n", name, id);
7138         return NULL;
7139     }
7140
7141     if(!vec[id]) {
7142         vec[id] = av_mallocz(size);
7143         if(vec[id] == NULL)
7144             av_log(h->s.avctx, AV_LOG_ERROR, "cannot allocate memory for %s\n", name);
7145     }
7146     return vec[id];
7147 }
7148
7149 static inline int decode_seq_parameter_set(H264Context *h){
7150     MpegEncContext * const s = &h->s;
7151     int profile_idc, level_idc;
7152     unsigned int sps_id, tmp, mb_width, mb_height;
7153     int i;
7154     SPS *sps;
7155
7156     profile_idc= get_bits(&s->gb, 8);
7157     get_bits1(&s->gb);   //constraint_set0_flag
7158     get_bits1(&s->gb);   //constraint_set1_flag
7159     get_bits1(&s->gb);   //constraint_set2_flag
7160     get_bits1(&s->gb);   //constraint_set3_flag
7161     get_bits(&s->gb, 4); // reserved
7162     level_idc= get_bits(&s->gb, 8);
7163     sps_id= get_ue_golomb(&s->gb);
7164
7165     sps = alloc_parameter_set(h, (void **)h->sps_buffers, sps_id, MAX_SPS_COUNT, sizeof(SPS), "sps");
7166     if(sps == NULL)
7167         return -1;
7168
7169     sps->profile_idc= profile_idc;
7170     sps->level_idc= level_idc;
7171
7172     if(sps->profile_idc >= 100){ //high profile
7173         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7174             get_bits1(&s->gb);  //residual_color_transform_flag
7175         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7176         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7177         sps->transform_bypass = get_bits1(&s->gb);
7178         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7179     }else
7180         sps->scaling_matrix_present = 0;
7181
7182     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7183     sps->poc_type= get_ue_golomb(&s->gb);
7184
7185     if(sps->poc_type == 0){ //FIXME #define
7186         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7187     } else if(sps->poc_type == 1){//FIXME #define
7188         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7189         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7190         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7191         tmp= get_ue_golomb(&s->gb);
7192
7193         if(tmp >= sizeof(sps->offset_for_ref_frame) / sizeof(sps->offset_for_ref_frame[0])){
7194             av_log(h->s.avctx, AV_LOG_ERROR, "poc_cycle_length overflow %u\n", tmp);
7195             return -1;
7196         }
7197         sps->poc_cycle_length= tmp;
7198
7199         for(i=0; i<sps->poc_cycle_length; i++)
7200             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7201     }else if(sps->poc_type != 2){
7202         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7203         return -1;
7204     }
7205
7206     tmp= get_ue_golomb(&s->gb);
7207     if(tmp > MAX_PICTURE_COUNT-2 || tmp >= 32){
7208         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7209         return -1;
7210     }
7211     sps->ref_frame_count= tmp;
7212     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7213     mb_width= get_ue_golomb(&s->gb) + 1;
7214     mb_height= get_ue_golomb(&s->gb) + 1;
7215     if(mb_width >= INT_MAX/16 || mb_height >= INT_MAX/16 ||
7216        avcodec_check_dimensions(NULL, 16*mb_width, 16*mb_height)){
7217         av_log(h->s.avctx, AV_LOG_ERROR, "mb_width/height overflow\n");
7218         return -1;
7219     }
7220     sps->mb_width = mb_width;
7221     sps->mb_height= mb_height;
7222
7223     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7224     if(!sps->frame_mbs_only_flag)
7225         sps->mb_aff= get_bits1(&s->gb);
7226     else
7227         sps->mb_aff= 0;
7228
7229     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7230
7231 #ifndef ALLOW_INTERLACE
7232     if(sps->mb_aff)
7233         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it at compile-time.\n");
7234 #endif
7235     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7236         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7237
7238     sps->crop= get_bits1(&s->gb);
7239     if(sps->crop){
7240         sps->crop_left  = get_ue_golomb(&s->gb);
7241         sps->crop_right = get_ue_golomb(&s->gb);
7242         sps->crop_top   = get_ue_golomb(&s->gb);
7243         sps->crop_bottom= get_ue_golomb(&s->gb);
7244         if(sps->crop_left || sps->crop_top){
7245             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7246         }
7247     }else{
7248         sps->crop_left  =
7249         sps->crop_right =
7250         sps->crop_top   =
7251         sps->crop_bottom= 0;
7252     }
7253
7254     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7255     if( sps->vui_parameters_present_flag )
7256         decode_vui_parameters(h, sps);
7257
7258     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7259         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%u profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7260                sps_id, sps->profile_idc, sps->level_idc,
7261                sps->poc_type,
7262                sps->ref_frame_count,
7263                sps->mb_width, sps->mb_height,
7264                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7265                sps->direct_8x8_inference_flag ? "8B8" : "",
7266                sps->crop_left, sps->crop_right,
7267                sps->crop_top, sps->crop_bottom,
7268                sps->vui_parameters_present_flag ? "VUI" : ""
7269                );
7270     }
7271     return 0;
7272 }
7273
7274 static void
7275 build_qp_table(PPS *pps, int t, int index)
7276 {
7277     int i;
7278     for(i = 0; i < 255; i++)
7279         pps->chroma_qp_table[t][i & 0xff] = chroma_qp[av_clip(i + index, 0, 51)];
7280 }
7281
7282 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7283     MpegEncContext * const s = &h->s;
7284     unsigned int tmp, pps_id= get_ue_golomb(&s->gb);
7285     PPS *pps;
7286
7287     pps = alloc_parameter_set(h, (void **)h->pps_buffers, pps_id, MAX_PPS_COUNT, sizeof(PPS), "pps");
7288     if(pps == NULL)
7289         return -1;
7290
7291     tmp= get_ue_golomb(&s->gb);
7292     if(tmp>=MAX_SPS_COUNT || h->sps_buffers[tmp] == NULL){
7293         av_log(h->s.avctx, AV_LOG_ERROR, "sps_id out of range\n");
7294         return -1;
7295     }
7296     pps->sps_id= tmp;
7297
7298     pps->cabac= get_bits1(&s->gb);
7299     pps->pic_order_present= get_bits1(&s->gb);
7300     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7301     if(pps->slice_group_count > 1 ){
7302         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7303         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7304         switch(pps->mb_slice_group_map_type){
7305         case 0:
7306 #if 0
7307 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7308 |    run_length[ i ]                                |1  |ue(v)   |
7309 #endif
7310             break;
7311         case 2:
7312 #if 0
7313 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7314 |{                                                  |   |        |
7315 |    top_left_mb[ i ]                               |1  |ue(v)   |
7316 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7317 |   }                                               |   |        |
7318 #endif
7319             break;
7320         case 3:
7321         case 4:
7322         case 5:
7323 #if 0
7324 |   slice_group_change_direction_flag               |1  |u(1)    |
7325 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7326 #endif
7327             break;
7328         case 6:
7329 #if 0
7330 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7331 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7332 |)                                                  |   |        |
7333 |    slice_group_id[ i ]                            |1  |u(v)    |
7334 #endif
7335             break;
7336         }
7337     }
7338     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7339     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7340     if(pps->ref_count[0]-1 > 32-1 || pps->ref_count[1]-1 > 32-1){
7341         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7342         pps->ref_count[0]= pps->ref_count[1]= 1;
7343         return -1;
7344     }
7345
7346     pps->weighted_pred= get_bits1(&s->gb);
7347     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7348     pps->init_qp= get_se_golomb(&s->gb) + 26;
7349     pps->init_qs= get_se_golomb(&s->gb) + 26;
7350     pps->chroma_qp_index_offset[0]= get_se_golomb(&s->gb);
7351     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7352     pps->constrained_intra_pred= get_bits1(&s->gb);
7353     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7354
7355     pps->transform_8x8_mode= 0;
7356     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7357     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7358     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7359
7360     if(get_bits_count(&s->gb) < bit_length){
7361         pps->transform_8x8_mode= get_bits1(&s->gb);
7362         decode_scaling_matrices(h, h->sps_buffers[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7363         pps->chroma_qp_index_offset[1]= get_se_golomb(&s->gb); //second_chroma_qp_index_offset
7364     } else {
7365         pps->chroma_qp_index_offset[1]= pps->chroma_qp_index_offset[0];
7366     }
7367
7368     build_qp_table(pps, 0, pps->chroma_qp_index_offset[0]);
7369     if(pps->chroma_qp_index_offset[0] != pps->chroma_qp_index_offset[1]) {
7370         build_qp_table(pps, 1, pps->chroma_qp_index_offset[1]);
7371         h->pps.chroma_qp_diff= 1;
7372     } else
7373         memcpy(pps->chroma_qp_table[1], pps->chroma_qp_table[0], 256);
7374
7375     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7376         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%u sps:%u %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d/%d %s %s %s %s\n",
7377                pps_id, pps->sps_id,
7378                pps->cabac ? "CABAC" : "CAVLC",
7379                pps->slice_group_count,
7380                pps->ref_count[0], pps->ref_count[1],
7381                pps->weighted_pred ? "weighted" : "",
7382                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset[0], pps->chroma_qp_index_offset[1],
7383                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7384                pps->constrained_intra_pred ? "CONSTR" : "",
7385                pps->redundant_pic_cnt_present ? "REDU" : "",
7386                pps->transform_8x8_mode ? "8x8DCT" : ""
7387                );
7388     }
7389
7390     return 0;
7391 }
7392
7393 /**
7394  * Call decode_slice() for each context.
7395  *
7396  * @param h h264 master context
7397  * @param context_count number of contexts to execute
7398  */
7399 static void execute_decode_slices(H264Context *h, int context_count){
7400     MpegEncContext * const s = &h->s;
7401     AVCodecContext * const avctx= s->avctx;
7402     H264Context *hx;
7403     int i;
7404
7405     if(context_count == 1) {
7406         decode_slice(avctx, h);
7407     } else {
7408         for(i = 1; i < context_count; i++) {
7409             hx = h->thread_context[i];
7410             hx->s.error_resilience = avctx->error_resilience;
7411             hx->s.error_count = 0;
7412         }
7413
7414         avctx->execute(avctx, (void *)decode_slice,
7415                        (void **)h->thread_context, NULL, context_count);
7416
7417         /* pull back stuff from slices to master context */
7418         hx = h->thread_context[context_count - 1];
7419         s->mb_x = hx->s.mb_x;
7420         s->mb_y = hx->s.mb_y;
7421         s->dropable = hx->s.dropable;
7422         s->picture_structure = hx->s.picture_structure;
7423         for(i = 1; i < context_count; i++)
7424             h->s.error_count += h->thread_context[i]->s.error_count;
7425     }
7426 }
7427
7428
7429 static int decode_nal_units(H264Context *h, const uint8_t *buf, int buf_size){
7430     MpegEncContext * const s = &h->s;
7431     AVCodecContext * const avctx= s->avctx;
7432     int buf_index=0;
7433     H264Context *hx; ///< thread context
7434     int context_count = 0;
7435
7436     h->max_contexts = avctx->thread_count;
7437 #if 0
7438     int i;
7439     for(i=0; i<50; i++){
7440         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7441     }
7442 #endif
7443     if(!(s->flags2 & CODEC_FLAG2_CHUNKS)){
7444         h->current_slice = 0;
7445         if (!s->first_field)
7446             s->current_picture_ptr= NULL;
7447     }
7448
7449     for(;;){
7450         int consumed;
7451         int dst_length;
7452         int bit_length;
7453         const uint8_t *ptr;
7454         int i, nalsize = 0;
7455         int err;
7456
7457         if(h->is_avc) {
7458             if(buf_index >= buf_size) break;
7459             nalsize = 0;
7460             for(i = 0; i < h->nal_length_size; i++)
7461                 nalsize = (nalsize << 8) | buf[buf_index++];
7462             if(nalsize <= 1 || (nalsize+buf_index > buf_size)){
7463                 if(nalsize == 1){
7464                     buf_index++;
7465                     continue;
7466                 }else{
7467                     av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7468                     break;
7469                 }
7470             }
7471         } else {
7472             // start code prefix search
7473             for(; buf_index + 3 < buf_size; buf_index++){
7474                 // This should always succeed in the first iteration.
7475                 if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7476                     break;
7477             }
7478
7479             if(buf_index+3 >= buf_size) break;
7480
7481             buf_index+=3;
7482         }
7483
7484         hx = h->thread_context[context_count];
7485
7486         ptr= decode_nal(hx, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7487         if (ptr==NULL || dst_length < 0){
7488             return -1;
7489         }
7490         while(ptr[dst_length - 1] == 0 && dst_length > 0)
7491             dst_length--;
7492         bit_length= !dst_length ? 0 : (8*dst_length - decode_rbsp_trailing(h, ptr + dst_length - 1));
7493
7494         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7495             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", hx->nal_unit_type, buf_index, buf_size, dst_length);
7496         }
7497
7498         if (h->is_avc && (nalsize != consumed))
7499             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7500
7501         buf_index += consumed;
7502
7503         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME do not discard SEI id
7504            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7505             continue;
7506
7507       again:
7508         err = 0;
7509         switch(hx->nal_unit_type){
7510         case NAL_IDR_SLICE:
7511             if (h->nal_unit_type != NAL_IDR_SLICE) {
7512                 av_log(h->s.avctx, AV_LOG_ERROR, "Invalid mix of idr and non-idr slices");
7513                 return -1;
7514             }
7515             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7516         case NAL_SLICE:
7517             init_get_bits(&hx->s.gb, ptr, bit_length);
7518             hx->intra_gb_ptr=
7519             hx->inter_gb_ptr= &hx->s.gb;
7520             hx->s.data_partitioning = 0;
7521
7522             if((err = decode_slice_header(hx, h)))
7523                break;
7524
7525             s->current_picture_ptr->key_frame|= (hx->nal_unit_type == NAL_IDR_SLICE);
7526             if(hx->redundant_pic_count==0 && hx->s.hurry_up < 5
7527                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7528                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7529                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7530                && avctx->skip_frame < AVDISCARD_ALL)
7531                 context_count++;
7532             break;
7533         case NAL_DPA:
7534             init_get_bits(&hx->s.gb, ptr, bit_length);
7535             hx->intra_gb_ptr=
7536             hx->inter_gb_ptr= NULL;
7537             hx->s.data_partitioning = 1;
7538
7539             err = decode_slice_header(hx, h);
7540             break;
7541         case NAL_DPB:
7542             init_get_bits(&hx->intra_gb, ptr, bit_length);
7543             hx->intra_gb_ptr= &hx->intra_gb;
7544             break;
7545         case NAL_DPC:
7546             init_get_bits(&hx->inter_gb, ptr, bit_length);
7547             hx->inter_gb_ptr= &hx->inter_gb;
7548
7549             if(hx->redundant_pic_count==0 && hx->intra_gb_ptr && hx->s.data_partitioning
7550                && s->context_initialized
7551                && s->hurry_up < 5
7552                && (avctx->skip_frame < AVDISCARD_NONREF || hx->nal_ref_idc)
7553                && (avctx->skip_frame < AVDISCARD_BIDIR  || hx->slice_type!=FF_B_TYPE)
7554                && (avctx->skip_frame < AVDISCARD_NONKEY || hx->slice_type==FF_I_TYPE)
7555                && avctx->skip_frame < AVDISCARD_ALL)
7556                 context_count++;
7557             break;
7558         case NAL_SEI:
7559             init_get_bits(&s->gb, ptr, bit_length);
7560             decode_sei(h);
7561             break;
7562         case NAL_SPS:
7563             init_get_bits(&s->gb, ptr, bit_length);
7564             decode_seq_parameter_set(h);
7565
7566             if(s->flags& CODEC_FLAG_LOW_DELAY)
7567                 s->low_delay=1;
7568
7569             if(avctx->has_b_frames < 2)
7570                 avctx->has_b_frames= !s->low_delay;
7571             break;
7572         case NAL_PPS:
7573             init_get_bits(&s->gb, ptr, bit_length);
7574
7575             decode_picture_parameter_set(h, bit_length);
7576
7577             break;
7578         case NAL_AUD:
7579         case NAL_END_SEQUENCE:
7580         case NAL_END_STREAM:
7581         case NAL_FILLER_DATA:
7582         case NAL_SPS_EXT:
7583         case NAL_AUXILIARY_SLICE:
7584             break;
7585         default:
7586             av_log(avctx, AV_LOG_DEBUG, "Unknown NAL code: %d (%d bits)\n", h->nal_unit_type, bit_length);
7587         }
7588
7589         if(context_count == h->max_contexts) {
7590             execute_decode_slices(h, context_count);
7591             context_count = 0;
7592         }
7593
7594         if (err < 0)
7595             av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7596         else if(err == 1) {
7597             /* Slice could not be decoded in parallel mode, copy down
7598              * NAL unit stuff to context 0 and restart. Note that
7599              * rbsp_buffer is not transfered, but since we no longer
7600              * run in parallel mode this should not be an issue. */
7601             h->nal_unit_type = hx->nal_unit_type;
7602             h->nal_ref_idc   = hx->nal_ref_idc;
7603             hx = h;
7604             goto again;
7605         }
7606     }
7607     if(context_count)
7608         execute_decode_slices(h, context_count);
7609     return buf_index;
7610 }
7611
7612 /**
7613  * returns the number of bytes consumed for building the current frame
7614  */
7615 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7616     if(s->flags&CODEC_FLAG_TRUNCATED){
7617         pos -= s->parse_context.last_index;
7618         if(pos<0) pos=0; // FIXME remove (unneeded?)
7619
7620         return pos;
7621     }else{
7622         if(pos==0) pos=1; //avoid infinite loops (i doubt that is needed but ...)
7623         if(pos+10>buf_size) pos=buf_size; // oops ;)
7624
7625         return pos;
7626     }
7627 }
7628
7629 static int decode_frame(AVCodecContext *avctx,
7630                              void *data, int *data_size,
7631                              const uint8_t *buf, int buf_size)
7632 {
7633     H264Context *h = avctx->priv_data;
7634     MpegEncContext *s = &h->s;
7635     AVFrame *pict = data;
7636     int buf_index;
7637
7638     s->flags= avctx->flags;
7639     s->flags2= avctx->flags2;
7640
7641    /* no supplementary picture */
7642     if (buf_size == 0) {
7643         Picture *out;
7644         int i, out_idx;
7645
7646 //FIXME factorize this with the output code below
7647         out = h->delayed_pic[0];
7648         out_idx = 0;
7649         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7650             if(h->delayed_pic[i]->poc < out->poc){
7651                 out = h->delayed_pic[i];
7652                 out_idx = i;
7653             }
7654
7655         for(i=out_idx; h->delayed_pic[i]; i++)
7656             h->delayed_pic[i] = h->delayed_pic[i+1];
7657
7658         if(out){
7659             *data_size = sizeof(AVFrame);
7660             *pict= *(AVFrame*)out;
7661         }
7662
7663         return 0;
7664     }
7665
7666     if(s->flags&CODEC_FLAG_TRUNCATED){
7667         int next= ff_h264_find_frame_end(h, buf, buf_size);
7668
7669         if( ff_combine_frame(&s->parse_context, next, (const uint8_t **)&buf, &buf_size) < 0 )
7670             return buf_size;
7671 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7672     }
7673
7674     if(h->is_avc && !h->got_avcC) {
7675         int i, cnt, nalsize;
7676         unsigned char *p = avctx->extradata;
7677         if(avctx->extradata_size < 7) {
7678             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7679             return -1;
7680         }
7681         if(*p != 1) {
7682             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7683             return -1;
7684         }
7685         /* sps and pps in the avcC always have length coded with 2 bytes,
7686            so put a fake nal_length_size = 2 while parsing them */
7687         h->nal_length_size = 2;
7688         // Decode sps from avcC
7689         cnt = *(p+5) & 0x1f; // Number of sps
7690         p += 6;
7691         for (i = 0; i < cnt; i++) {
7692             nalsize = AV_RB16(p) + 2;
7693             if(decode_nal_units(h, p, nalsize) < 0) {
7694                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7695                 return -1;
7696             }
7697             p += nalsize;
7698         }
7699         // Decode pps from avcC
7700         cnt = *(p++); // Number of pps
7701         for (i = 0; i < cnt; i++) {
7702             nalsize = AV_RB16(p) + 2;
7703             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7704                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7705                 return -1;
7706             }
7707             p += nalsize;
7708         }
7709         // Now store right nal length size, that will be use to parse all other nals
7710         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7711         // Do not reparse avcC
7712         h->got_avcC = 1;
7713     }
7714
7715     if(avctx->frame_number==0 && !h->is_avc && s->avctx->extradata_size){
7716         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7717             return -1;
7718     }
7719
7720     buf_index=decode_nal_units(h, buf, buf_size);
7721     if(buf_index < 0)
7722         return -1;
7723
7724     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) && !s->current_picture_ptr){
7725         if (avctx->skip_frame >= AVDISCARD_NONREF || s->hurry_up) return 0;
7726         av_log(avctx, AV_LOG_ERROR, "no frame!\n");
7727         return -1;
7728     }
7729
7730     if(!(s->flags2 & CODEC_FLAG2_CHUNKS) || (s->mb_y >= s->mb_height && s->mb_height)){
7731         Picture *out = s->current_picture_ptr;
7732         Picture *cur = s->current_picture_ptr;
7733         Picture *prev = h->delayed_output_pic;
7734         int i, pics, cross_idr, out_of_order, out_idx;
7735
7736         s->mb_y= 0;
7737
7738         s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7739         s->current_picture_ptr->pict_type= s->pict_type;
7740
7741         h->prev_frame_num_offset= h->frame_num_offset;
7742         h->prev_frame_num= h->frame_num;
7743         if(!s->dropable) {
7744             h->prev_poc_msb= h->poc_msb;
7745             h->prev_poc_lsb= h->poc_lsb;
7746             execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7747         }
7748
7749         /*
7750          * FIXME: Error handling code does not seem to support interlaced
7751          * when slices span multiple rows
7752          * The ff_er_add_slice calls don't work right for bottom
7753          * fields; they cause massive erroneous error concealing
7754          * Error marking covers both fields (top and bottom).
7755          * This causes a mismatched s->error_count
7756          * and a bad error table. Further, the error count goes to
7757          * INT_MAX when called for bottom field, because mb_y is
7758          * past end by one (callers fault) and resync_mb_y != 0
7759          * causes problems for the first MB line, too.
7760          */
7761         if (!FIELD_PICTURE)
7762             ff_er_frame_end(s);
7763
7764         MPV_frame_end(s);
7765
7766         if (s->first_field) {
7767             /* Wait for second field. */
7768             *data_size = 0;
7769
7770         } else {
7771             cur->interlaced_frame = FIELD_OR_MBAFF_PICTURE;
7772             /* Derive top_field_first from field pocs. */
7773             cur->top_field_first = cur->field_poc[0] < cur->field_poc[1];
7774
7775         //FIXME do something with unavailable reference frames
7776
7777 #if 0 //decode order
7778             *data_size = sizeof(AVFrame);
7779 #else
7780             /* Sort B-frames into display order */
7781
7782             if(h->sps.bitstream_restriction_flag
7783                && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7784                 s->avctx->has_b_frames = h->sps.num_reorder_frames;
7785                 s->low_delay = 0;
7786             }
7787
7788             pics = 0;
7789             while(h->delayed_pic[pics]) pics++;
7790
7791             assert(pics+1 < sizeof(h->delayed_pic) / sizeof(h->delayed_pic[0]));
7792
7793             h->delayed_pic[pics++] = cur;
7794             if(cur->reference == 0)
7795                 cur->reference = DELAYED_PIC_REF;
7796
7797             cross_idr = 0;
7798             for(i=0; h->delayed_pic[i]; i++)
7799                 if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7800                     cross_idr = 1;
7801
7802             out = h->delayed_pic[0];
7803             out_idx = 0;
7804             for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7805                 if(h->delayed_pic[i]->poc < out->poc){
7806                     out = h->delayed_pic[i];
7807                     out_idx = i;
7808                 }
7809
7810             out_of_order = !cross_idr && prev && out->poc < prev->poc;
7811             if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7812                 { }
7813             else if(prev && pics <= s->avctx->has_b_frames)
7814                 out = prev;
7815             else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7816                || (s->low_delay &&
7817                 ((!cross_idr && prev && out->poc > prev->poc + 2)
7818                  || cur->pict_type == FF_B_TYPE)))
7819             {
7820                 s->low_delay = 0;
7821                 s->avctx->has_b_frames++;
7822                 out = prev;
7823             }
7824             else if(out_of_order)
7825                 out = prev;
7826
7827             if(out_of_order || pics > s->avctx->has_b_frames){
7828                 for(i=out_idx; h->delayed_pic[i]; i++)
7829                     h->delayed_pic[i] = h->delayed_pic[i+1];
7830             }
7831
7832             if(prev == out)
7833                 *data_size = 0;
7834             else
7835                 *data_size = sizeof(AVFrame);
7836             if(prev && prev != out && prev->reference == DELAYED_PIC_REF)
7837                 prev->reference = 0;
7838             h->delayed_output_pic = out;
7839 #endif
7840
7841             if(out)
7842                 *pict= *(AVFrame*)out;
7843             else
7844                 av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7845         }
7846     }
7847
7848     assert(pict->data[0] || !*data_size);
7849     ff_print_debug_info(s, pict);
7850 //printf("out %d\n", (int)pict->data[0]);
7851 #if 0 //?
7852
7853     /* Return the Picture timestamp as the frame number */
7854     /* we subtract 1 because it is added on utils.c     */
7855     avctx->frame_number = s->picture_number - 1;
7856 #endif
7857     return get_consumed_bytes(s, buf_index, buf_size);
7858 }
7859 #if 0
7860 static inline void fill_mb_avail(H264Context *h){
7861     MpegEncContext * const s = &h->s;
7862     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
7863
7864     if(s->mb_y){
7865         h->mb_avail[0]= s->mb_x                 && h->slice_table[mb_xy - s->mb_stride - 1] == h->slice_num;
7866         h->mb_avail[1]=                            h->slice_table[mb_xy - s->mb_stride    ] == h->slice_num;
7867         h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[mb_xy - s->mb_stride + 1] == h->slice_num;
7868     }else{
7869         h->mb_avail[0]=
7870         h->mb_avail[1]=
7871         h->mb_avail[2]= 0;
7872     }
7873     h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
7874     h->mb_avail[4]= 1; //FIXME move out
7875     h->mb_avail[5]= 0; //FIXME move out
7876 }
7877 #endif
7878
#ifdef TEST
#undef printf
#undef random
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Standalone self-test, built only when TEST is defined.
 *
 * Writes COUNT unsigned and then COUNT signed exp-golomb codes into a
 * scratch buffer, reads them back and prints every value that does not
 * round-trip. Each put/get call is wrapped in START_TIMER/STOP_TIMER.
 * Mismatches are only reported; they do not change the exit status.
 *
 * The (I)DCT, quantizer and NAL checks are compiled out with "#if 0".
 *
 * @return 0 (the only active return statement)
 */
int main(void){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    /* zero-filled so that dsputil_init() is never handed indeterminate fields */
    AVCodecContext avctx = {0};

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* peek at the upcoming bits so a mismatch can be reported with them */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        /* centre the range on zero: values run from -COUNT/2 to COUNT/2-1 */
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("mismatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    /* Disabled checks below. NOTE(review): this section uses s, qp,
       src1_block and src2_block, none of which are declared in this
       function, so it needs work before it can be re-enabled. */
#if 0
    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        s->dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= FFABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("mismatch\n");
            return -1;
        }
    }
#endif

    printf("Testing RBSP\n");


    return 0;
}
#endif /* TEST */
8053
8054
8055 static int decode_end(AVCodecContext *avctx)
8056 {
8057     H264Context *h = avctx->priv_data;
8058     MpegEncContext *s = &h->s;
8059
8060     av_freep(&h->rbsp_buffer[0]);
8061     av_freep(&h->rbsp_buffer[1]);
8062     free_tables(h); //FIXME cleanup init stuff perhaps
8063     MPV_common_end(s);
8064
8065 //    memset(h, 0, sizeof(H264Context));
8066
8067     return 0;
8068 }
8069
8070
8071 AVCodec h264_decoder = {
8072     "h264",
8073     CODEC_TYPE_VIDEO,
8074     CODEC_ID_H264,
8075     sizeof(H264Context),
8076     decode_init,
8077     NULL,
8078     decode_end,
8079     decode_frame,
8080     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8081     .flush= flush_dpb,
8082 };
8083
8084 #include "svq3.c"