libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "snow.h"
  24
  25 #include "rangecoder.h"
  26
  27 #include "mpegvideo.h"
  28
  29 #undef NDEBUG
  30 #include <assert.h>
  31
  32 static const int8_t quant3[256]={
  33  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  41 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  48 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  49 };
  50 static const int8_t quant3b[256]={
  51  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  59 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  67 };
  68 static const int8_t quant3bA[256]={
  69  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  70  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  85 };
  86 static const int8_t quant5[256]={
  87  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  88  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  95 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 102 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 103 };
 104 static const int8_t quant7[256]={
 105  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 107  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 108  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 113 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 118 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 119 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 120 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 121 };
 122 static const int8_t quant9[256]={
 123  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 124  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 125  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 131 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 137 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 138 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 139 };
 140 static const int8_t quant11[256]={
 141  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 142  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 143  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 144  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 154 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 155 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 156 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 157 };
 158 static const int8_t quant13[256]={
 159  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 160  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 161  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 162  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 163  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 167 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 171 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 173 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 174 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 175 };
 176
 177 #if 0 //64*cubic
 178 static const uint8_t obmc32[1024]={
 179   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 180   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 181   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 16, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 182   0,  0,  4,  4,  8,  8, 12, 16, 16, 20, 24, 24, 28, 28, 32, 32, 32, 32, 28, 28, 24, 24, 20, 16, 16, 12,  8,  8,  4,  4,  0,  0,
 183   0,  0,  4,  8,  8, 12, 16, 24, 28, 32, 36, 40, 44, 48, 48, 48, 48, 48, 48, 44, 40, 36, 32, 28, 24, 16, 12,  8,  8,  4,  0,  0,
 184   0,  4,  4,  8, 12, 20, 24, 32, 40, 44, 52, 56, 60, 64, 68, 72, 72, 68, 64, 60, 56, 52, 44, 40, 32, 24, 20, 12,  8,  4,  4,  0,
 185   0,  4,  4, 12, 16, 24, 32, 40, 52, 60, 68, 76, 80, 88, 88, 92, 92, 88, 88, 80, 76, 68, 60, 52, 40, 32, 24, 16, 12,  4,  4,  0,
 186   0,  4,  8, 16, 24, 32, 40, 52, 64, 76, 84, 92,100,108,112,116,116,112,108,100, 92, 84, 76, 64, 52, 40, 32, 24, 16,  8,  4,  0,
 187   0,  4,  8, 16, 28, 40, 52, 64, 76, 88,100,112,124,132,136,140,140,136,132,124,112,100, 88, 76, 64, 52, 40, 28, 16,  8,  4,  0,
 188   0,  4, 12, 20, 32, 44, 60, 76, 88,104,120,132,144,152,160,164,164,160,152,144,132,120,104, 88, 76, 60, 44, 32, 20, 12,  4,  0,
 189   0,  4, 12, 24, 36, 48, 68, 84,100,120,136,152,164,176,180,184,184,180,176,164,152,136,120,100, 84, 68, 48, 36, 24, 12,  4,  0,
 190   0,  4, 12, 24, 40, 56, 76, 92,112,132,152,168,180,192,204,208,208,204,192,180,168,152,132,112, 92, 76, 56, 40, 24, 12,  4,  0,
 191   0,  4, 16, 28, 44, 60, 80,100,124,144,164,180,196,208,220,224,224,220,208,196,180,164,144,124,100, 80, 60, 44, 28, 16,  4,  0,
 192   0,  8, 16, 28, 48, 64, 88,108,132,152,176,192,208,224,232,240,240,232,224,208,192,176,152,132,108, 88, 64, 48, 28, 16,  8,  0,
 193   0,  4, 16, 32, 48, 68, 88,112,136,160,180,204,220,232,244,248,248,244,232,220,204,180,160,136,112, 88, 68, 48, 32, 16,  4,  0,
 194   1,  8, 16, 32, 48, 72, 92,116,140,164,184,208,224,240,248,255,255,248,240,224,208,184,164,140,116, 92, 72, 48, 32, 16,  8,  1,
 195   1,  8, 16, 32, 48, 72, 92,116,140,164,184,208,224,240,248,255,255,248,240,224,208,184,164,140,116, 92, 72, 48, 32, 16,  8,  1,
 196   0,  4, 16, 32, 48, 68, 88,112,136,160,180,204,220,232,244,248,248,244,232,220,204,180,160,136,112, 88, 68, 48, 32, 16,  4,  0,
 197   0,  8, 16, 28, 48, 64, 88,108,132,152,176,192,208,224,232,240,240,232,224,208,192,176,152,132,108, 88, 64, 48, 28, 16,  8,  0,
 198   0,  4, 16, 28, 44, 60, 80,100,124,144,164,180,196,208,220,224,224,220,208,196,180,164,144,124,100, 80, 60, 44, 28, 16,  4,  0,
 199   0,  4, 12, 24, 40, 56, 76, 92,112,132,152,168,180,192,204,208,208,204,192,180,168,152,132,112, 92, 76, 56, 40, 24, 12,  4,  0,
 200   0,  4, 12, 24, 36, 48, 68, 84,100,120,136,152,164,176,180,184,184,180,176,164,152,136,120,100, 84, 68, 48, 36, 24, 12,  4,  0,
 201   0,  4, 12, 20, 32, 44, 60, 76, 88,104,120,132,144,152,160,164,164,160,152,144,132,120,104, 88, 76, 60, 44, 32, 20, 12,  4,  0,
 202   0,  4,  8, 16, 28, 40, 52, 64, 76, 88,100,112,124,132,136,140,140,136,132,124,112,100, 88, 76, 64, 52, 40, 28, 16,  8,  4,  0,
 203   0,  4,  8, 16, 24, 32, 40, 52, 64, 76, 84, 92,100,108,112,116,116,112,108,100, 92, 84, 76, 64, 52, 40, 32, 24, 16,  8,  4,  0,
 204   0,  4,  4, 12, 16, 24, 32, 40, 52, 60, 68, 76, 80, 88, 88, 92, 92, 88, 88, 80, 76, 68, 60, 52, 40, 32, 24, 16, 12,  4,  4,  0,
 205   0,  4,  4,  8, 12, 20, 24, 32, 40, 44, 52, 56, 60, 64, 68, 72, 72, 68, 64, 60, 56, 52, 44, 40, 32, 24, 20, 12,  8,  4,  4,  0,
 206   0,  0,  4,  8,  8, 12, 16, 24, 28, 32, 36, 40, 44, 48, 48, 48, 48, 48, 48, 44, 40, 36, 32, 28, 24, 16, 12,  8,  8,  4,  0,  0,
 207   0,  0,  4,  4,  8,  8, 12, 16, 16, 20, 24, 24, 28, 28, 32, 32, 32, 32, 28, 28, 24, 24, 20, 16, 16, 12,  8,  8,  4,  4,  0,  0,
 208   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 16, 16, 16, 16, 16, 16, 16, 16, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 209   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 210   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 211 //error:0.000022
 212 };
 213 static const uint8_t obmc16[256]={
 214   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 215   0,  4,  4,  8, 16, 20, 20, 24, 24, 20, 20, 16,  8,  4,  4,  0,
 216   0,  4, 16, 24, 36, 44, 52, 60, 60, 52, 44, 36, 24, 16,  4,  0,
 217   0,  8, 24, 44, 60, 80, 96,104,104, 96, 80, 60, 44, 24,  8,  0,
 218   0, 16, 36, 60, 92,116,136,152,152,136,116, 92, 60, 36, 16,  0,
 219   0, 20, 44, 80,116,152,180,196,196,180,152,116, 80, 44, 20,  0,
 220   4, 20, 52, 96,136,180,212,228,228,212,180,136, 96, 52, 20,  4,
 221   4, 24, 60,104,152,196,228,248,248,228,196,152,104, 60, 24,  4,
 222   4, 24, 60,104,152,196,228,248,248,228,196,152,104, 60, 24,  4,
 223   4, 20, 52, 96,136,180,212,228,228,212,180,136, 96, 52, 20,  4,
 224   0, 20, 44, 80,116,152,180,196,196,180,152,116, 80, 44, 20,  0,
 225   0, 16, 36, 60, 92,116,136,152,152,136,116, 92, 60, 36, 16,  0,
 226   0,  8, 24, 44, 60, 80, 96,104,104, 96, 80, 60, 44, 24,  8,  0,
 227   0,  4, 16, 24, 36, 44, 52, 60, 60, 52, 44, 36, 24, 16,  4,  0,
 228   0,  4,  4,  8, 16, 20, 20, 24, 24, 20, 20, 16,  8,  4,  4,  0,
 229   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 230 //error:0.000033
 231 };
 232 #elif 1 // 64*linear
 233 static const uint8_t obmc32[1024]={
 234   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 235   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 236   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 237   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 238   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 239   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 240   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 241   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 242   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 243   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 244   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 245   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 246   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 247   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 248   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 249   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 250   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 251   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 252   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 253   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 254   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 255   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 256   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 257   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 258   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 259   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 260   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 261   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 262   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 263   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 264   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 265   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 266  //error:0.000020
 267 };
 268 static const uint8_t obmc16[256]={
 269   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 270   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 271   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 272   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 273   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 274  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 275  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 276  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 277  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 278  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 279  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 280   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 281   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 282   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 283   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 284   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 285 //error:0.000015
 286 };
 287 #else //64*cos
 288 static const uint8_t obmc32[1024]={
 289   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 290   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  4,  4,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 291   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 12, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 292   0,  0,  4,  4,  4,  8,  8, 12, 16, 20, 20, 24, 28, 28, 28, 28, 28, 28, 28, 28, 24, 20, 20, 16, 12,  8,  8,  4,  4,  4,  0,  0,
 293   0,  0,  4,  4,  8, 12, 16, 20, 24, 28, 36, 40, 44, 44, 48, 48, 48, 48, 44, 44, 40, 36, 28, 24, 20, 16, 12,  8,  4,  4,  0,  0,
 294   0,  0,  4,  8, 12, 20, 24, 32, 36, 44, 48, 56, 60, 64, 68, 68, 68, 68, 64, 60, 56, 48, 44, 36, 32, 24, 20, 12,  8,  4,  0,  0,
 295   0,  4,  4,  8, 16, 24, 32, 40, 48, 60, 68, 76, 80, 84, 88, 92, 92, 88, 84, 80, 76, 68, 60, 48, 40, 32, 24, 16,  8,  4,  4,  0,
 296   0,  4,  8, 12, 20, 32, 40, 52, 64, 76, 84, 96,104,108,112,116,116,112,108,104, 96, 84, 76, 64, 52, 40, 32, 20, 12,  8,  4,  0,
 297   0,  4,  8, 16, 24, 36, 48, 64, 76, 92,104,116,124,132,136,140,140,136,132,124,116,104, 92, 76, 64, 48, 36, 24, 16,  8,  4,  0,
 298   0,  4, 12, 20, 28, 44, 60, 76, 92,104,120,136,148,156,160,164,164,160,156,148,136,120,104, 92, 76, 60, 44, 28, 20, 12,  4,  0,
 299   0,  4, 12, 20, 36, 48, 68, 84,104,120,140,152,168,176,184,188,188,184,176,168,152,140,120,104, 84, 68, 48, 36, 20, 12,  4,  0,
 300   0,  4, 12, 24, 36, 56, 76, 96,116,136,152,172,184,196,204,208,208,204,196,184,172,152,136,116, 96, 76, 56, 36, 24, 12,  4,  0,
 301   0,  4, 12, 24, 44, 60, 80,104,124,148,168,184,200,212,224,228,228,224,212,200,184,168,148,124,104, 80, 60, 44, 24, 12,  4,  0,
 302   0,  4, 12, 28, 44, 64, 84,108,132,156,176,196,212,228,236,240,240,236,228,212,196,176,156,132,108, 84, 64, 44, 28, 12,  4,  0,
 303   0,  4, 16, 28, 48, 68, 88,112,136,160,184,204,224,236,244,252,252,244,236,224,204,184,160,136,112, 88, 68, 48, 28, 16,  4,  0,
 304   1,  4, 16, 28, 48, 68, 92,116,140,164,188,208,228,240,252,255,255,252,240,228,208,188,164,140,116, 92, 68, 48, 28, 16,  4,  1,
 305   1,  4, 16, 28, 48, 68, 92,116,140,164,188,208,228,240,252,255,255,252,240,228,208,188,164,140,116, 92, 68, 48, 28, 16,  4,  1,
 306   0,  4, 16, 28, 48, 68, 88,112,136,160,184,204,224,236,244,252,252,244,236,224,204,184,160,136,112, 88, 68, 48, 28, 16,  4,  0,
 307   0,  4, 12, 28, 44, 64, 84,108,132,156,176,196,212,228,236,240,240,236,228,212,196,176,156,132,108, 84, 64, 44, 28, 12,  4,  0,
 308   0,  4, 12, 24, 44, 60, 80,104,124,148,168,184,200,212,224,228,228,224,212,200,184,168,148,124,104, 80, 60, 44, 24, 12,  4,  0,
 309   0,  4, 12, 24, 36, 56, 76, 96,116,136,152,172,184,196,204,208,208,204,196,184,172,152,136,116, 96, 76, 56, 36, 24, 12,  4,  0,
 310   0,  4, 12, 20, 36, 48, 68, 84,104,120,140,152,168,176,184,188,188,184,176,168,152,140,120,104, 84, 68, 48, 36, 20, 12,  4,  0,
 311   0,  4, 12, 20, 28, 44, 60, 76, 92,104,120,136,148,156,160,164,164,160,156,148,136,120,104, 92, 76, 60, 44, 28, 20, 12,  4,  0,
 312   0,  4,  8, 16, 24, 36, 48, 64, 76, 92,104,116,124,132,136,140,140,136,132,124,116,104, 92, 76, 64, 48, 36, 24, 16,  8,  4,  0,
 313   0,  4,  8, 12, 20, 32, 40, 52, 64, 76, 84, 96,104,108,112,116,116,112,108,104, 96, 84, 76, 64, 52, 40, 32, 20, 12,  8,  4,  0,
 314   0,  4,  4,  8, 16, 24, 32, 40, 48, 60, 68, 76, 80, 84, 88, 92, 92, 88, 84, 80, 76, 68, 60, 48, 40, 32, 24, 16,  8,  4,  4,  0,
 315   0,  0,  4,  8, 12, 20, 24, 32, 36, 44, 48, 56, 60, 64, 68, 68, 68, 68, 64, 60, 56, 48, 44, 36, 32, 24, 20, 12,  8,  4,  0,  0,
 316   0,  0,  4,  4,  8, 12, 16, 20, 24, 28, 36, 40, 44, 44, 48, 48, 48, 48, 44, 44, 40, 36, 28, 24, 20, 16, 12,  8,  4,  4,  0,  0,
 317   0,  0,  4,  4,  4,  8,  8, 12, 16, 20, 20, 24, 28, 28, 28, 28, 28, 28, 28, 28, 24, 20, 20, 16, 12,  8,  8,  4,  4,  4,  0,  0,
 318   0,  0,  0,  4,  4,  4,  4,  8,  8, 12, 12, 12, 12, 16, 16, 16, 16, 16, 16, 12, 12, 12, 12,  8,  8,  4,  4,  4,  4,  0,  0,  0,
 319   0,  0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  4,  4,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,  0,
 320   0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 321 //error:0.000022
 322 };
 323 static const uint8_t obmc16[256]={
 324   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 325   0,  0,  4,  8, 12, 16, 20, 20, 20, 20, 16, 12,  8,  4,  0,  0,
 326   0,  4, 12, 24, 32, 44, 52, 56, 56, 52, 44, 32, 24, 12,  4,  0,
 327   0,  8, 24, 40, 60, 80, 96,104,104, 96, 80, 60, 40, 24,  8,  0,
 328   0, 12, 32, 64, 92,120,140,152,152,140,120, 92, 64, 32, 12,  0,
 329   4, 16, 44, 80,120,156,184,196,196,184,156,120, 80, 44, 16,  4,
 330   4, 20, 52, 96,140,184,216,232,232,216,184,140, 96, 52, 20,  4,
 331   0, 20, 56,104,152,196,232,252,252,232,196,152,104, 56, 20,  0,
 332   0, 20, 56,104,152,196,232,252,252,232,196,152,104, 56, 20,  0,
 333   4, 20, 52, 96,140,184,216,232,232,216,184,140, 96, 52, 20,  4,
 334   4, 16, 44, 80,120,156,184,196,196,184,156,120, 80, 44, 16,  4,
 335   0, 12, 32, 64, 92,120,140,152,152,140,120, 92, 64, 32, 12,  0,
 336   0,  8, 24, 40, 60, 80, 96,104,104, 96, 80, 60, 40, 24,  8,  0,
 337   0,  4, 12, 24, 32, 44, 52, 56, 56, 52, 44, 32, 24, 12,  4,  0,
 338   0,  0,  4,  8, 12, 16, 20, 20, 20, 20, 16, 12,  8,  4,  0,  0,
 339   0,  0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,  0,
 340 //error:0.000022
 341 };
 342 #endif /* 0 */
 343
 344 //linear *64
 345 static const uint8_t obmc8[64]={
 346   4, 12, 20, 28, 28, 20, 12,  4,
 347  12, 36, 60, 84, 84, 60, 36, 12,
 348  20, 60,100,140,140,100, 60, 20,
 349  28, 84,140,196,196,140, 84, 28,
 350  28, 84,140,196,196,140, 84, 28,
 351  20, 60,100,140,140,100, 60, 20,
 352  12, 36, 60, 84, 84, 60, 36, 12,
 353   4, 12, 20, 28, 28, 20, 12,  4,
 354 //error:0.000000
 355 };
 356
 357 //linear *64
 358 static const uint8_t obmc4[16]={
 359  16, 48, 48, 16,
 360  48,144,144, 48,
 361  48,144,144, 48,
 362  16, 48, 48, 16,
 363 //error:0.000000
 364 };
 365
 366 static const uint8_t *obmc_tab[4]={
 367     obmc32, obmc16, obmc8, obmc4
 368 };
 369
 370 static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
 371
 372 typedef struct BlockNode{
 373     int16_t mx;
 374     int16_t my;
 375     uint8_t ref;
 376     uint8_t color[3];
 377     uint8_t type;
 378 //#define TYPE_SPLIT    1
 379 #define BLOCK_INTRA   1
 380 #define BLOCK_OPT     2
 381 //#define TYPE_NOCOLOR  4
 382     uint8_t level; //FIXME merge into type?
 383 }BlockNode;
 384
 385 static const BlockNode null_block= { //FIXME add border maybe
 386     .color= {128,128,128},
 387     .mx= 0,
 388     .my= 0,
 389     .ref= 0,
 390     .type= 0,
 391     .level= 0,
 392 };
 393
 394 #define LOG2_MB_SIZE 4
 395 #define MB_SIZE (1<<LOG2_MB_SIZE)
 396 #define ENCODER_EXTRA_BITS 4
 397 #define HTAPS_MAX 8
 398
 399 typedef struct x_and_coeff{
 400     int16_t x;
 401     uint16_t coeff;
 402 } x_and_coeff;
 403
 404 typedef struct SubBand{
 405     int level;
 406     int stride;
 407     int width;
 408     int height;
 409     int qlog;                                   ///< log(qscale)/log[2^(1/6)]
 410     DWTELEM *buf;
 411     IDWTELEM *ibuf;
 412     int buf_x_offset;
 413     int buf_y_offset;
 414     int stride_line; ///< Stride measured in lines, not pixels.
 415     x_and_coeff * x_coeff;
 416     struct SubBand *parent;
 417     uint8_t state[/*7*2*/ 7 + 512][32];
 418 }SubBand;
 419
 420 typedef struct Plane{
 421     int width;
 422     int height;
 423     SubBand band[MAX_DECOMPOSITIONS][4];
 424
 425     int htaps;
 426     int8_t hcoeff[HTAPS_MAX/2];
 427     int diag_mc;
 428     int fast_mc;
 429
 430     int last_htaps;
 431     int8_t last_hcoeff[HTAPS_MAX/2];
 432     int last_diag_mc;
 433 }Plane;
 434
 435 typedef struct SnowContext{
 436 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 437
 438     AVCodecContext *avctx;
 439     RangeCoder c;
 440     DSPContext dsp;
 441     AVFrame new_picture;
 442     AVFrame input_picture;              ///< new_picture with the internal linesizes
 443     AVFrame current_picture;
 444     AVFrame last_picture[MAX_REF_FRAMES];
 445     uint8_t *halfpel_plane[MAX_REF_FRAMES][4][4];
 446     AVFrame mconly_picture;
 447 //     uint8_t q_context[16];
 448     uint8_t header_state[32];
 449     uint8_t block_state[128 + 32*128];
 450     int keyframe;
 451     int always_reset;
 452     int version;
 453     int spatial_decomposition_type;
 454     int last_spatial_decomposition_type;
 455     int temporal_decomposition_type;
 456     int spatial_decomposition_count;
 457     int last_spatial_decomposition_count;
 458     int temporal_decomposition_count;
 459     int max_ref_frames;
 460     int ref_frames;
 461     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 462     uint32_t *ref_scores[MAX_REF_FRAMES];
 463     DWTELEM *spatial_dwt_buffer;
 464     IDWTELEM *spatial_idwt_buffer;
 465     int colorspace_type;
 466     int chroma_h_shift;
 467     int chroma_v_shift;
 468     int spatial_scalability;
 469     int qlog;
 470     int last_qlog;
 471     int lambda;
 472     int lambda2;
 473     int pass1_rc;
 474     int mv_scale;
 475     int last_mv_scale;
 476     int qbias;
 477     int last_qbias;
 478 #define QBIAS_SHIFT 3
 479     int b_width;
 480     int b_height;
 481     int block_max_depth;
 482     int last_block_max_depth;
 483     Plane plane[MAX_PLANES];
 484     BlockNode *block;
 485 #define ME_CACHE_SIZE 1024
 486     int me_cache[ME_CACHE_SIZE];
 487     int me_cache_generation;
 488     slice_buffer sb;
 489
 490     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 491 }SnowContext;
 492
 493 typedef struct {
 494     IDWTELEM *b0;
 495     IDWTELEM *b1;
 496     IDWTELEM *b2;
 497     IDWTELEM *b3;
 498     int y;
 499 } dwt_compose_t;
 500
 501 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 502 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 503
 504 static void iterative_me(SnowContext *s);
 505
 506 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, IDWTELEM * base_buffer)
 507 {
 508     int i;
 509
 510     buf->base_buffer = base_buffer;
 511     buf->line_count = line_count;
 512     buf->line_width = line_width;
 513     buf->data_count = max_allocated_lines;
 514     buf->line = av_mallocz (sizeof(IDWTELEM *) * line_count);
 515     buf->data_stack = av_malloc (sizeof(IDWTELEM *) * max_allocated_lines);
 516
 517     for (i = 0; i < max_allocated_lines; i++)
 518     {
 519         buf->data_stack[i] = av_malloc (sizeof(IDWTELEM) * line_width);
 520     }
 521
 522     buf->data_stack_top = max_allocated_lines - 1;
 523 }
 524
 525 static IDWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 526 {
 527     int offset;
 528     IDWTELEM * buffer;
 529
 530 //  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
 531
 532     assert(buf->data_stack_top >= 0);
 533 //  assert(!buf->line[line]);
 534     if (buf->line[line])
 535         return buf->line[line];
 536
 537     offset = buf->line_width * line;
 538     buffer = buf->data_stack[buf->data_stack_top];
 539     buf->data_stack_top--;
 540     buf->line[line] = buffer;
 541
 542 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 543
 544     return buffer;
 545 }
 546
 547 static void slice_buffer_release(slice_buffer * buf, int line)
 548 {
 549     int offset;
 550     IDWTELEM * buffer;
 551
 552     assert(line >= 0 && line < buf->line_count);
 553     assert(buf->line[line]);
 554
 555     offset = buf->line_width * line;
 556     buffer = buf->line[line];
 557     buf->data_stack_top++;
 558     buf->data_stack[buf->data_stack_top] = buffer;
 559     buf->line[line] = NULL;
 560
 561 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 562 }
 563
 564 static void slice_buffer_flush(slice_buffer * buf)
 565 {
 566     int i;
 567     for (i = 0; i < buf->line_count; i++)
 568     {
 569         if (buf->line[i])
 570         {
 571 //      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
 572             slice_buffer_release(buf, i);
 573         }
 574     }
 575 }
 576
 577 static void slice_buffer_destroy(slice_buffer * buf)
 578 {
 579     int i;
 580     slice_buffer_flush(buf);
 581
 582     for (i = buf->data_count - 1; i >= 0; i--)
 583     {
 584         av_freep(&buf->data_stack[i]);
 585     }
 586     av_freep(&buf->data_stack);
 587     av_freep(&buf->line);
 588 }
 589
 590 #ifdef __sgi
 591 // Avoid a name clash on SGI IRIX
 592 #undef qexp
 593 #endif
 594 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 595 static uint8_t qexp[QROOT];
 596
 597 static inline int mirror(int v, int m){
 598     while((unsigned)v > (unsigned)m){
 599         v=-v;
 600         if(v<0) v+= 2*m;
 601     }
 602     return v;
 603 }
 604
 605 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 606     int i;
 607
 608     if(v){
 609         const int a= FFABS(v);
 610         const int e= av_log2(a);
 611 #if 1
 612         const int el= FFMIN(e, 10);
 613         put_rac(c, state+0, 0);
 614
 615         for(i=0; i<el; i++){
 616             put_rac(c, state+1+i, 1);  //1..10
 617         }
 618         for(; i<e; i++){
 619             put_rac(c, state+1+9, 1);  //1..10
 620         }
 621         put_rac(c, state+1+FFMIN(i,9), 0);
 622
 623         for(i=e-1; i>=el; i--){
 624             put_rac(c, state+22+9, (a>>i)&1); //22..31
 625         }
 626         for(; i>=0; i--){
 627             put_rac(c, state+22+i, (a>>i)&1); //22..31
 628         }
 629
 630         if(is_signed)
 631             put_rac(c, state+11 + el, v < 0); //11..21
 632 #else
 633
 634         put_rac(c, state+0, 0);
 635         if(e<=9){
 636             for(i=0; i<e; i++){
 637                 put_rac(c, state+1+i, 1);  //1..10
 638             }
 639             put_rac(c, state+1+i, 0);
 640
 641             for(i=e-1; i>=0; i--){
 642                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 643             }
 644
 645             if(is_signed)
 646                 put_rac(c, state+11 + e, v < 0); //11..21
 647         }else{
 648             for(i=0; i<e; i++){
 649                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 650             }
 651             put_rac(c, state+1+FFMIN(i,9), 0);
 652
 653             for(i=e-1; i>=0; i--){
 654                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 655             }
 656
 657             if(is_signed)
 658                 put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
 659         }
 660 #endif /* 1 */
 661     }else{
 662         put_rac(c, state+0, 1);
 663     }
 664 }
 665
 666 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 667     if(get_rac(c, state+0))
 668         return 0;
 669     else{
 670         int i, e, a;
 671         e= 0;
 672         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 673             e++;
 674         }
 675
 676         a= 1;
 677         for(i=e-1; i>=0; i--){
 678             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 679         }
 680
 681         if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
 682             return -a;
 683         else
 684             return a;
 685     }
 686 }
 687
 688 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 689     int i;
 690     int r= log2>=0 ? 1<<log2 : 1;
 691
 692     assert(v>=0);
 693     assert(log2>=-4);
 694
 695     while(v >= r){
 696         put_rac(c, state+4+log2, 1);
 697         v -= r;
 698         log2++;
 699         if(log2>0) r+=r;
 700     }
 701     put_rac(c, state+4+log2, 0);
 702
 703     for(i=log2-1; i>=0; i--){
 704         put_rac(c, state+31-i, (v>>i)&1);
 705     }
 706 }
 707
 708 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 709     int i;
 710     int r= log2>=0 ? 1<<log2 : 1;
 711     int v=0;
 712
 713     assert(log2>=-4);
 714
 715     while(get_rac(c, state+4+log2)){
 716         v+= r;
 717         log2++;
 718         if(log2>0) r+=r;
 719     }
 720
 721     for(i=log2-1; i>=0; i--){
 722         v+= get_rac(c, state+31-i)<<i;
 723     }
 724
 725     return v;
 726 }
 727
 728 static av_always_inline void
 729 lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
 730      int dst_step, int src_step, int ref_step,
 731      int width, int mul, int add, int shift,
 732      int highpass, int inverse){
 733     const int mirror_left= !highpass;
 734     const int mirror_right= (width&1) ^ highpass;
 735     const int w= (width>>1) - 1 + (highpass & width);
 736     int i;
 737
 738 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 739     if(mirror_left){
 740         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 741         dst += dst_step;
 742         src += src_step;
 743     }
 744
 745     for(i=0; i<w; i++){
 746         dst[i*dst_step] =
 747             LIFT(src[i*src_step],
 748                  ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift),
 749                  inverse);
 750     }
 751
 752     if(mirror_right){
 753         dst[w*dst_step] =
 754             LIFT(src[w*src_step],
 755                  ((mul*2*ref[w*ref_step]+add)>>shift),
 756                  inverse);
 757     }
 758 }
 759
 760 static av_always_inline void
 761 inv_lift(IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref,
 762          int dst_step, int src_step, int ref_step,
 763          int width, int mul, int add, int shift,
 764          int highpass, int inverse){
 765     const int mirror_left= !highpass;
 766     const int mirror_right= (width&1) ^ highpass;
 767     const int w= (width>>1) - 1 + (highpass & width);
 768     int i;
 769
 770 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 771     if(mirror_left){
 772         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 773         dst += dst_step;
 774         src += src_step;
 775     }
 776
 777     for(i=0; i<w; i++){
 778         dst[i*dst_step] =
 779             LIFT(src[i*src_step],
 780                  ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift),
 781                  inverse);
 782     }
 783
 784     if(mirror_right){
 785         dst[w*dst_step] =
 786             LIFT(src[w*src_step],
 787                  ((mul*2*ref[w*ref_step]+add)>>shift),
 788                  inverse);
 789     }
 790 }
 791
 792 #ifndef liftS
 793 static av_always_inline void
 794 liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref,
 795       int dst_step, int src_step, int ref_step,
 796       int width, int mul, int add, int shift,
 797       int highpass, int inverse){
 798     const int mirror_left= !highpass;
 799     const int mirror_right= (width&1) ^ highpass;
 800     const int w= (width>>1) - 1 + (highpass & width);
 801     int i;
 802
 803     assert(shift == 4);
 804 #define LIFTS(src, ref, inv) \
 805         ((inv) ? \
 806             (src) + (((ref) + 4*(src))>>shift): \
 807             -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
 808     if(mirror_left){
 809         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 810         dst += dst_step;
 811         src += src_step;
 812     }
 813
 814     for(i=0; i<w; i++){
 815         dst[i*dst_step] =
 816             LIFTS(src[i*src_step],
 817                   mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add,
 818                   inverse);
 819     }
 820
 821     if(mirror_right){
 822         dst[w*dst_step] =
 823             LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 824     }
 825 }
 826 static av_always_inline void
 827 inv_liftS(IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref,
 828           int dst_step, int src_step, int ref_step,
 829           int width, int mul, int add, int shift,
 830           int highpass, int inverse){
 831     const int mirror_left= !highpass;
 832     const int mirror_right= (width&1) ^ highpass;
 833     const int w= (width>>1) - 1 + (highpass & width);
 834     int i;
 835
 836     assert(shift == 4);
 837 #define LIFTS(src, ref, inv) \
 838     ((inv) ? \
 839         (src) + (((ref) + 4*(src))>>shift): \
 840         -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
 841     if(mirror_left){
 842         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 843         dst += dst_step;
 844         src += src_step;
 845     }
 846
 847     for(i=0; i<w; i++){
 848         dst[i*dst_step] =
 849             LIFTS(src[i*src_step],
 850                   mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add,
 851                   inverse);
 852     }
 853
 854     if(mirror_right){
 855         dst[w*dst_step] =
 856             LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 857     }
 858 }
 859 #endif
 860
 861 static void horizontal_decompose53i(DWTELEM *b, int width){
 862     DWTELEM temp[width];
 863     const int width2= width>>1;
 864     int x;
 865     const int w2= (width+1)>>1;
 866
 867     for(x=0; x<width2; x++){
 868         temp[x   ]= b[2*x    ];
 869         temp[x+w2]= b[2*x + 1];
 870     }
 871     if(width&1)
 872         temp[x   ]= b[2*x    ];
 873 #if 0
 874     {
 875     int A1,A2,A3,A4;
 876     A2= temp[1       ];
 877     A4= temp[0       ];
 878     A1= temp[0+width2];
 879     A1 -= (A2 + A4)>>1;
 880     A4 += (A1 + 1)>>1;
 881     b[0+width2] = A1;
 882     b[0       ] = A4;
 883     for(x=1; x+1<width2; x+=2){
 884         A3= temp[x+width2];
 885         A4= temp[x+1     ];
 886         A3 -= (A2 + A4)>>1;
 887         A2 += (A1 + A3 + 2)>>2;
 888         b[x+width2] = A3;
 889         b[x       ] = A2;
 890
 891         A1= temp[x+1+width2];
 892         A2= temp[x+2       ];
 893         A1 -= (A2 + A4)>>1;
 894         A4 += (A1 + A3 + 2)>>2;
 895         b[x+1+width2] = A1;
 896         b[x+1       ] = A4;
 897     }
 898     A3= temp[width-1];
 899     A3 -= A2;
 900     A2 += (A1 + A3 + 2)>>2;
 901     b[width -1] = A3;
 902     b[width2-1] = A2;
 903     }
 904 #else
 905     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
 906     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
 907 #endif
 908 }
 909
 910 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 911     int i;
 912
 913     for(i=0; i<width; i++){
 914         b1[i] -= (b0[i] + b2[i])>>1;
 915     }
 916 }
 917
 918 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 919     int i;
 920
 921     for(i=0; i<width; i++){
 922         b1[i] += (b0[i] + b2[i] + 2)>>2;
 923     }
 924 }
 925
 926 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
 927     int y;
 928     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
 929     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
 930
 931     for(y=-2; y<height; y+=2){
 932         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
 933         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
 934
 935 {START_TIMER
 936         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
 937         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
 938 STOP_TIMER("horizontal_decompose53i")}
 939
 940 {START_TIMER
 941         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
 942         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
 943 STOP_TIMER("vertical_decompose53i*")}
 944
 945         b0=b2;
 946         b1=b3;
 947     }
 948 }
 949
 950 static void horizontal_decompose97i(DWTELEM *b, int width){
 951     DWTELEM temp[width];
 952     const int w2= (width+1)>>1;
 953
 954     lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
 955     liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
 956     lift (b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
 957     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
 958 }
 959
 960
 961 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 962     int i;
 963
 964     for(i=0; i<width; i++){
 965         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
 966     }
 967 }
 968
 969 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 970     int i;
 971
 972     for(i=0; i<width; i++){
 973         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
 974     }
 975 }
 976
 977 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 978     int i;
 979
 980     for(i=0; i<width; i++){
 981 #ifdef liftS
 982         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
 983 #else
 984         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
 985 #endif
 986     }
 987 }
 988
 989 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 990     int i;
 991
 992     for(i=0; i<width; i++){
 993         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
 994     }
 995 }
 996
 997 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
 998     int y;
 999     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
1000     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
1001     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
1002     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
1003
1004     for(y=-4; y<height; y+=2){
1005         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1006         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1007
1008 {START_TIMER
1009         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
1010         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
1011 if(width>400){
1012 STOP_TIMER("horizontal_decompose97i")
1013 }}
1014
1015 {START_TIMER
1016         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
1017         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
1018         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
1019         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
1020
1021 if(width>400){
1022 STOP_TIMER("vertical_decompose97i")
1023 }}
1024
1025         b0=b2;
1026         b1=b3;
1027         b2=b4;
1028         b3=b5;
1029     }
1030 }
1031
1032 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1033     int level;
1034
1035     for(level=0; level<decomposition_count; level++){
1036         switch(type){
1037         case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
1038         case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
1039         }
1040     }
1041 }
1042
1043 static void horizontal_compose53i(IDWTELEM *b, int width){
1044     IDWTELEM temp[width];
1045     const int width2= width>>1;
1046     const int w2= (width+1)>>1;
1047     int x;
1048
1049 #if 0
1050     int A1,A2,A3,A4;
1051     A2= temp[1       ];
1052     A4= temp[0       ];
1053     A1= temp[0+width2];
1054     A1 -= (A2 + A4)>>1;
1055     A4 += (A1 + 1)>>1;
1056     b[0+width2] = A1;
1057     b[0       ] = A4;
1058     for(x=1; x+1<width2; x+=2){
1059         A3= temp[x+width2];
1060         A4= temp[x+1     ];
1061         A3 -= (A2 + A4)>>1;
1062         A2 += (A1 + A3 + 2)>>2;
1063         b[x+width2] = A3;
1064         b[x       ] = A2;
1065
1066         A1= temp[x+1+width2];
1067         A2= temp[x+2       ];
1068         A1 -= (A2 + A4)>>1;
1069         A4 += (A1 + A3 + 2)>>2;
1070         b[x+1+width2] = A1;
1071         b[x+1       ] = A4;
1072     }
1073     A3= temp[width-1];
1074     A3 -= A2;
1075     A2 += (A1 + A3 + 2)>>2;
1076     b[width -1] = A3;
1077     b[width2-1] = A2;
1078 #else
1079     inv_lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1080     inv_lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1081 #endif /* 0 */
1082     for(x=0; x<width2; x++){
1083         b[2*x    ]= temp[x   ];
1084         b[2*x + 1]= temp[x+w2];
1085     }
1086     if(width&1)
1087         b[2*x    ]= temp[x   ];
1088 }
1089
1090 static void vertical_compose53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1091     int i;
1092
1093     for(i=0; i<width; i++){
1094         b1[i] += (b0[i] + b2[i])>>1;
1095     }
1096 }
1097
1098 static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1099     int i;
1100
1101     for(i=0; i<width; i++){
1102         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1103     }
1104 }
1105
1106 static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1107     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1108     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1109     cs->y = -1;
1110 }
1111
1112 static void spatial_compose53i_init(dwt_compose_t *cs, IDWTELEM *buffer, int height, int stride){
1113     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1114     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1115     cs->y = -1;
1116 }
1117
1118 static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1119     int y= cs->y;
1120
1121     IDWTELEM *b0= cs->b0;
1122     IDWTELEM *b1= cs->b1;
1123     IDWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1124     IDWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1125
1126 {START_TIMER
1127         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1128         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1129 STOP_TIMER("vertical_compose53i*")}
1130
1131 {START_TIMER
1132         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1133         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1134 STOP_TIMER("horizontal_compose53i")}
1135
1136     cs->b0 = b2;
1137     cs->b1 = b3;
1138     cs->y += 2;
1139 }
1140
1141 static void spatial_compose53i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride){
1142     int y= cs->y;
1143     IDWTELEM *b0= cs->b0;
1144     IDWTELEM *b1= cs->b1;
1145     IDWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1146     IDWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1147
1148 {START_TIMER
1149         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1150         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1151 STOP_TIMER("vertical_compose53i*")}
1152
1153 {START_TIMER
1154         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1155         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1156 STOP_TIMER("horizontal_compose53i")}
1157
1158     cs->b0 = b2;
1159     cs->b1 = b3;
1160     cs->y += 2;
1161 }
1162
1163 static void av_unused spatial_compose53i(IDWTELEM *buffer, int width, int height, int stride){
1164     dwt_compose_t cs;
1165     spatial_compose53i_init(&cs, buffer, height, stride);
1166     while(cs.y <= height)
1167         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1168 }
1169
1170
1171 void ff_snow_horizontal_compose97i(IDWTELEM *b, int width){
1172     IDWTELEM temp[width];
1173     const int w2= (width+1)>>1;
1174
1175     inv_lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1176     inv_lift (temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1177     inv_liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
1178     inv_lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
1179 }
1180
1181 static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1182     int i;
1183
1184     for(i=0; i<width; i++){
1185         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1186     }
1187 }
1188
1189 static void vertical_compose97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1190     int i;
1191
1192     for(i=0; i<width; i++){
1193         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1194     }
1195 }
1196
1197 static void vertical_compose97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1198     int i;
1199
1200     for(i=0; i<width; i++){
1201 #ifdef liftS
1202         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1203 #else
1204         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1205 #endif
1206     }
1207 }
1208
1209 static void vertical_compose97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1210     int i;
1211
1212     for(i=0; i<width; i++){
1213         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1214     }
1215 }
1216
1217 void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
1218     int i;
1219
1220     for(i=0; i<width; i++){
1221         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1222         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1223 #ifdef liftS
1224         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1225 #else
1226         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1227 #endif
1228         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1229     }
1230 }
1231
1232 static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1233     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1234     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1235     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1236     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1237     cs->y = -3;
1238 }
1239
1240 static void spatial_compose97i_init(dwt_compose_t *cs, IDWTELEM *buffer, int height, int stride){
1241     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1242     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1243     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1244     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1245     cs->y = -3;
1246 }
1247
1248 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1249     int y = cs->y;
1250
1251     IDWTELEM *b0= cs->b0;
1252     IDWTELEM *b1= cs->b1;
1253     IDWTELEM *b2= cs->b2;
1254     IDWTELEM *b3= cs->b3;
1255     IDWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1256     IDWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1257
1258 {START_TIMER
1259     if(y>0 && y+4<height){
1260         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1261     }else{
1262         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1263         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1264         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1265         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1266     }
1267 if(width>400){
1268 STOP_TIMER("vertical_compose97i")}}
1269
1270 {START_TIMER
1271         if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1272         if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1273 if(width>400 && y+0<(unsigned)height){
1274 STOP_TIMER("horizontal_compose97i")}}
1275
1276     cs->b0=b2;
1277     cs->b1=b3;
1278     cs->b2=b4;
1279     cs->b3=b5;
1280     cs->y += 2;
1281 }
1282
1283 static void spatial_compose97i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride){
1284     int y = cs->y;
1285     IDWTELEM *b0= cs->b0;
1286     IDWTELEM *b1= cs->b1;
1287     IDWTELEM *b2= cs->b2;
1288     IDWTELEM *b3= cs->b3;
1289     IDWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1290     IDWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1291
1292 {START_TIMER
1293         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1294         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1295         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1296         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1297 if(width>400){
1298 STOP_TIMER("vertical_compose97i")}}
1299
1300 {START_TIMER
1301         if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1302         if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1303 if(width>400 && b0 <= b2){
1304 STOP_TIMER("horizontal_compose97i")}}
1305
1306     cs->b0=b2;
1307     cs->b1=b3;
1308     cs->b2=b4;
1309     cs->b3=b5;
1310     cs->y += 2;
1311 }
1312
1313 static void av_unused spatial_compose97i(IDWTELEM *buffer, int width, int height, int stride){
1314     dwt_compose_t cs;
1315     spatial_compose97i_init(&cs, buffer, height, stride);
1316     while(cs.y <= height)
1317         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1318 }
1319
1320 static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1321     int level;
1322     for(level=decomposition_count-1; level>=0; level--){
1323         switch(type){
1324         case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1325         case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1326         }
1327     }
1328 }
1329
1330 static void ff_spatial_idwt_init(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1331     int level;
1332     for(level=decomposition_count-1; level>=0; level--){
1333         switch(type){
1334         case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1335         case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1336         }
1337     }
1338 }
1339
1340 static void ff_spatial_idwt_slice(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1341     const int support = type==1 ? 3 : 5;
1342     int level;
1343     if(type==2) return;
1344
1345     for(level=decomposition_count-1; level>=0; level--){
1346         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1347             switch(type){
1348             case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1349                     break;
1350             case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1351                     break;
1352             }
1353         }
1354     }
1355 }
1356
1357 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1358     const int support = type==1 ? 3 : 5;
1359     int level;
1360     if(type==2) return;
1361
1362     for(level=decomposition_count-1; level>=0; level--){
1363         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1364             switch(type){
1365             case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1366                     break;
1367             case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1368                     break;
1369             }
1370         }
1371     }
1372 }
1373
1374 static void ff_spatial_idwt(IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1375         dwt_compose_t cs[MAX_DECOMPOSITIONS];
1376         int y;
1377         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1378         for(y=0; y<height; y+=4)
1379             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1380 }
1381
1382 static int encode_subband_c0run(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
1383     const int w= b->width;
1384     const int h= b->height;
1385     int x, y;
1386
1387     if(1){
1388         int run=0;
1389         int runs[w*h];
1390         int run_index=0;
1391         int max_index;
1392
1393         for(y=0; y<h; y++){
1394             for(x=0; x<w; x++){
1395                 int v, p=0;
1396                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1397                 v= src[x + y*stride];
1398
1399                 if(y){
1400                     t= src[x + (y-1)*stride];
1401                     if(x){
1402                         lt= src[x - 1 + (y-1)*stride];
1403                     }
1404                     if(x + 1 < w){
1405                         rt= src[x + 1 + (y-1)*stride];
1406                     }
1407                 }
1408                 if(x){
1409                     l= src[x - 1 + y*stride];
1410                     /*if(x > 1){
1411                         if(orientation==1) ll= src[y + (x-2)*stride];
1412                         else               ll= src[x - 2 + y*stride];
1413                     }*/
1414                 }
1415                 if(parent){
1416                     int px= x>>1;
1417                     int py= y>>1;
1418                     if(px<b->parent->width && py<b->parent->height)
1419                         p= parent[px + py*2*stride];
1420                 }
1421                 if(!(/*ll|*/l|lt|t|rt|p)){
1422                     if(v){
1423                         runs[run_index++]= run;
1424                         run=0;
1425                     }else{
1426                         run++;
1427                     }
1428                 }
1429             }
1430         }
1431         max_index= run_index;
1432         runs[run_index++]= run;
1433         run_index=0;
1434         run= runs[run_index++];
1435
1436         put_symbol2(&s->c, b->state[30], max_index, 0);
1437         if(run_index <= max_index)
1438             put_symbol2(&s->c, b->state[1], run, 3);
1439
1440         for(y=0; y<h; y++){
1441             if(s->c.bytestream_end - s->c.bytestream < w*40){
1442                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1443                 return -1;
1444             }
1445             for(x=0; x<w; x++){
1446                 int v, p=0;
1447                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1448                 v= src[x + y*stride];
1449
1450                 if(y){
1451                     t= src[x + (y-1)*stride];
1452                     if(x){
1453                         lt= src[x - 1 + (y-1)*stride];
1454                     }
1455                     if(x + 1 < w){
1456                         rt= src[x + 1 + (y-1)*stride];
1457                     }
1458                 }
1459                 if(x){
1460                     l= src[x - 1 + y*stride];
1461                     /*if(x > 1){
1462                         if(orientation==1) ll= src[y + (x-2)*stride];
1463                         else               ll= src[x - 2 + y*stride];
1464                     }*/
1465                 }
1466                 if(parent){
1467                     int px= x>>1;
1468                     int py= y>>1;
1469                     if(px<b->parent->width && py<b->parent->height)
1470                         p= parent[px + py*2*stride];
1471                 }
1472                 if(/*ll|*/l|lt|t|rt|p){
1473                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1474
1475                     put_rac(&s->c, &b->state[0][context], !!v);
1476                 }else{
1477                     if(!run){
1478                         run= runs[run_index++];
1479
1480                         if(run_index <= max_index)
1481                             put_symbol2(&s->c, b->state[1], run, 3);
1482                         assert(v);
1483                     }else{
1484                         run--;
1485                         assert(!v);
1486                     }
1487                 }
1488                 if(v){
1489                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1490                     int l2= 2*FFABS(l) + (l<0);
1491                     int t2= 2*FFABS(t) + (t<0);
1492
1493                     put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1494                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1495                 }
1496             }
1497         }
1498     }
1499     return 0;
1500 }
1501
1502 static int encode_subband(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
1503 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1504 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1505     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1506 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1507 }
1508
1509 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1510     const int w= b->width;
1511     const int h= b->height;
1512     int x,y;
1513
1514     if(1){
1515         int run, runs;
1516         x_and_coeff *xc= b->x_coeff;
1517         x_and_coeff *prev_xc= NULL;
1518         x_and_coeff *prev2_xc= xc;
1519         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1520         x_and_coeff *prev_parent_xc= parent_xc;
1521
1522         runs= get_symbol2(&s->c, b->state[30], 0);
1523         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1524         else           run= INT_MAX;
1525
1526         for(y=0; y<h; y++){
1527             int v=0;
1528             int lt=0, t=0, rt=0;
1529
1530             if(y && prev_xc->x == 0){
1531                 rt= prev_xc->coeff;
1532             }
1533             for(x=0; x<w; x++){
1534                 int p=0;
1535                 const int l= v;
1536
1537                 lt= t; t= rt;
1538
1539                 if(y){
1540                     if(prev_xc->x <= x)
1541                         prev_xc++;
1542                     if(prev_xc->x == x + 1)
1543                         rt= prev_xc->coeff;
1544                     else
1545                         rt=0;
1546                 }
1547                 if(parent_xc){
1548                     if(x>>1 > parent_xc->x){
1549                         parent_xc++;
1550                     }
1551                     if(x>>1 == parent_xc->x){
1552                         p= parent_xc->coeff;
1553                     }
1554                 }
1555                 if(/*ll|*/l|lt|t|rt|p){
1556                     int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1557
1558                     v=get_rac(&s->c, &b->state[0][context]);
1559                     if(v){
1560                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1561                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1562
1563                         xc->x=x;
1564                         (xc++)->coeff= v;
1565                     }
1566                 }else{
1567                     if(!run){
1568                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1569                         else           run= INT_MAX;
1570                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1571                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1572
1573                         xc->x=x;
1574                         (xc++)->coeff= v;
1575                     }else{
1576                         int max_run;
1577                         run--;
1578                         v=0;
1579
1580                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1581                         else  max_run= FFMIN(run, w-x-1);
1582                         if(parent_xc)
1583                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1584                         x+= max_run;
1585                         run-= max_run;
1586                     }
1587                 }
1588             }
1589             (xc++)->x= w+1; //end marker
1590             prev_xc= prev2_xc;
1591             prev2_xc= xc;
1592
1593             if(parent_xc){
1594                 if(y&1){
1595                     while(parent_xc->x != parent->width+1)
1596                         parent_xc++;
1597                     parent_xc++;
1598                     prev_parent_xc= parent_xc;
1599                 }else{
1600                     parent_xc= prev_parent_xc;
1601                 }
1602             }
1603         }
1604
1605         (xc++)->x= w+1; //end marker
1606     }
1607 }
1608
1609 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1610     const int w= b->width;
1611     int y;
1612     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1613     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1614     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1615     int new_index = 0;
1616
1617     START_TIMER
1618
1619     if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
1620         qadd= 0;
1621         qmul= 1<<QEXPSHIFT;
1622     }
1623
1624     /* If we are on the second or later slice, restore our index. */
1625     if (start_y != 0)
1626         new_index = save_state[0];
1627
1628
1629     for(y=start_y; y<h; y++){
1630         int x = 0;
1631         int v;
1632         IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1633         memset(line, 0, b->width*sizeof(IDWTELEM));
1634         v = b->x_coeff[new_index].coeff;
1635         x = b->x_coeff[new_index++].x;
1636         while(x < w)
1637         {
1638             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1639             register int u= -(v&1);
1640             line[x] = (t^u) - u;
1641
1642             v = b->x_coeff[new_index].coeff;
1643             x = b->x_coeff[new_index++].x;
1644         }
1645     }
1646     if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1647         STOP_TIMER("decode_subband")
1648     }
1649
1650     /* Save our variables for the next slice. */
1651     save_state[0] = new_index;
1652
1653     return;
1654 }
1655
1656 static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1657     int plane_index, level, orientation;
1658
1659     for(plane_index=0; plane_index<3; plane_index++){
1660         for(level=0; level<MAX_DECOMPOSITIONS; level++){
1661             for(orientation=level ? 1:0; orientation<4; orientation++){
1662                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1663             }
1664         }
1665     }
1666     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1667     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1668 }
1669
1670 static int alloc_blocks(SnowContext *s){
1671     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1672     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1673
1674     s->b_width = w;
1675     s->b_height= h;
1676
1677     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1678     return 0;
1679 }
1680
1681 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1682     uint8_t *bytestream= d->bytestream;
1683     uint8_t *bytestream_start= d->bytestream_start;
1684     *d= *s;
1685     d->bytestream= bytestream;
1686     d->bytestream_start= bytestream_start;
1687 }
1688
1689 //near copy & paste from dsputil, FIXME
1690 static int pix_sum(uint8_t * pix, int line_size, int w)
1691 {
1692     int s, i, j;
1693
1694     s = 0;
1695     for (i = 0; i < w; i++) {
1696         for (j = 0; j < w; j++) {
1697             s += pix[0];
1698             pix ++;
1699         }
1700         pix += line_size - w;
1701     }
1702     return s;
1703 }
1704
1705 //near copy & paste from dsputil, FIXME
1706 static int pix_norm1(uint8_t * pix, int line_size, int w)
1707 {
1708     int s, i, j;
1709     uint32_t *sq = ff_squareTbl + 256;
1710
1711     s = 0;
1712     for (i = 0; i < w; i++) {
1713         for (j = 0; j < w; j ++) {
1714             s += sq[pix[0]];
1715             pix ++;
1716         }
1717         pix += line_size - w;
1718     }
1719     return s;
1720 }
1721
1722 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1723     const int w= s->b_width << s->block_max_depth;
1724     const int rem_depth= s->block_max_depth - level;
1725     const int index= (x + y*w) << rem_depth;
1726     const int block_w= 1<<rem_depth;
1727     BlockNode block;
1728     int i,j;
1729
1730     block.color[0]= l;
1731     block.color[1]= cb;
1732     block.color[2]= cr;
1733     block.mx= mx;
1734     block.my= my;
1735     block.ref= ref;
1736     block.type= type;
1737     block.level= level;
1738
1739     for(j=0; j<block_w; j++){
1740         for(i=0; i<block_w; i++){
1741             s->block[index + i + j*w]= block;
1742         }
1743     }
1744 }
1745
1746 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1747     const int offset[3]= {
1748           y*c->  stride + x,
1749         ((y*c->uvstride + x)>>1),
1750         ((y*c->uvstride + x)>>1),
1751     };
1752     int i;
1753     for(i=0; i<3; i++){
1754         c->src[0][i]= src [i];
1755         c->ref[0][i]= ref [i] + offset[i];
1756     }
1757     assert(!ref_index);
1758 }
1759
1760 static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1761                            const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1762     if(s->ref_frames == 1){
1763         *mx = mid_pred(left->mx, top->mx, tr->mx);
1764         *my = mid_pred(left->my, top->my, tr->my);
1765     }else{
1766         const int *scale = scale_mv_ref[ref];
1767         *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1768                        (top ->mx * scale[top ->ref] + 128) >>8,
1769                        (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1770         *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1771                        (top ->my * scale[top ->ref] + 128) >>8,
1772                        (tr  ->my * scale[tr  ->ref] + 128) >>8);
1773     }
1774 }
1775
1776 //FIXME copy&paste
1777 #define P_LEFT P[1]
1778 #define P_TOP P[2]
1779 #define P_TOPRIGHT P[3]
1780 #define P_MEDIAN P[4]
1781 #define P_MV1 P[9]
1782 #define FLAG_QPEL   1 //must be 1
1783
1784 static int encode_q_branch(SnowContext *s, int level, int x, int y){
1785     uint8_t p_buffer[1024];
1786     uint8_t i_buffer[1024];
1787     uint8_t p_state[sizeof(s->block_state)];
1788     uint8_t i_state[sizeof(s->block_state)];
1789     RangeCoder pc, ic;
1790     uint8_t *pbbak= s->c.bytestream;
1791     uint8_t *pbbak_start= s->c.bytestream_start;
1792     int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
1793     const int w= s->b_width  << s->block_max_depth;
1794     const int h= s->b_height << s->block_max_depth;
1795     const int rem_depth= s->block_max_depth - level;
1796     const int index= (x + y*w) << rem_depth;
1797     const int block_w= 1<<(LOG2_MB_SIZE - level);
1798     int trx= (x+1)<<rem_depth;
1799     int try= (y+1)<<rem_depth;
1800     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
1801     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
1802     const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1803     const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
1804     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1805     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1806     int pl = left->color[0];
1807     int pcb= left->color[1];
1808     int pcr= left->color[2];
1809     int pmx, pmy;
1810     int mx=0, my=0;
1811     int l,cr,cb;
1812     const int stride= s->current_picture.linesize[0];
1813     const int uvstride= s->current_picture.linesize[1];
1814     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
1815                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
1816                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
1817     int P[10][2];
1818     int16_t last_mv[3][2];
1819     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
1820     const int shift= 1+qpel;
1821     MotionEstContext *c= &s->m.me;
1822     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
1823     int mx_context= av_log2(2*FFABS(left->mx - top->mx));
1824     int my_context= av_log2(2*FFABS(left->my - top->my));
1825     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
1826     int ref, best_ref, ref_score, ref_mx, ref_my;
1827
1828     assert(sizeof(s->block_state) >= 256);
1829     if(s->keyframe){
1830         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
1831         return 0;
1832     }
1833
1834 //    clip predictors / edge ?
1835
1836     P_LEFT[0]= left->mx;
1837     P_LEFT[1]= left->my;
1838     P_TOP [0]= top->mx;
1839     P_TOP [1]= top->my;
1840     P_TOPRIGHT[0]= tr->mx;
1841     P_TOPRIGHT[1]= tr->my;
1842
1843     last_mv[0][0]= s->block[index].mx;
1844     last_mv[0][1]= s->block[index].my;
1845     last_mv[1][0]= right->mx;
1846     last_mv[1][1]= right->my;
1847     last_mv[2][0]= bottom->mx;
1848     last_mv[2][1]= bottom->my;
1849
1850     s->m.mb_stride=2;
1851     s->m.mb_x=
1852     s->m.mb_y= 0;
1853     c->skip= 0;
1854
1855     assert(c->  stride ==   stride);
1856     assert(c->uvstride == uvstride);
1857
1858     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
1859     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
1860     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
1861     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
1862
1863     c->xmin = - x*block_w - 16+2;
1864     c->ymin = - y*block_w - 16+2;
1865     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
1866     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
1867
1868     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
1869     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
1870     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
1871     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
1872     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
1873     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
1874     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
1875
1876     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
1877     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
1878
1879     if (!y) {
1880         c->pred_x= P_LEFT[0];
1881         c->pred_y= P_LEFT[1];
1882     } else {
1883         c->pred_x = P_MEDIAN[0];
1884         c->pred_y = P_MEDIAN[1];
1885     }
1886
1887     score= INT_MAX;
1888     best_ref= 0;
1889     for(ref=0; ref<s->ref_frames; ref++){
1890         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
1891
1892         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
1893                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
1894
1895         assert(ref_mx >= c->xmin);
1896         assert(ref_mx <= c->xmax);
1897         assert(ref_my >= c->ymin);
1898         assert(ref_my <= c->ymax);
1899
1900         ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
1901         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
1902         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
1903         if(s->ref_mvs[ref]){
1904             s->ref_mvs[ref][index][0]= ref_mx;
1905             s->ref_mvs[ref][index][1]= ref_my;
1906             s->ref_scores[ref][index]= ref_score;
1907         }
1908         if(score > ref_score){
1909             score= ref_score;
1910             best_ref= ref;
1911             mx= ref_mx;
1912             my= ref_my;
1913         }
1914     }
1915     //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
1916
1917   //  subpel search
1918     base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
1919     pc= s->c;
1920     pc.bytestream_start=
1921     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
1922     memcpy(p_state, s->block_state, sizeof(s->block_state));
1923
1924     if(level!=s->block_max_depth)
1925         put_rac(&pc, &p_state[4 + s_context], 1);
1926     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
1927     if(s->ref_frames > 1)
1928         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
1929     pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
1930     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
1931     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
1932     p_len= pc.bytestream - pc.bytestream_start;
1933     score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
1934
1935     block_s= block_w*block_w;
1936     sum = pix_sum(current_data[0], stride, block_w);
1937     l= (sum + block_s/2)/block_s;
1938     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
1939
1940     block_s= block_w*block_w>>2;
1941     sum = pix_sum(current_data[1], uvstride, block_w>>1);
1942     cb= (sum + block_s/2)/block_s;
1943 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
1944     sum = pix_sum(current_data[2], uvstride, block_w>>1);
1945     cr= (sum + block_s/2)/block_s;
1946 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
1947
1948     ic= s->c;
1949     ic.bytestream_start=
1950     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
1951     memcpy(i_state, s->block_state, sizeof(s->block_state));
1952     if(level!=s->block_max_depth)
1953         put_rac(&ic, &i_state[4 + s_context], 1);
1954     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
1955     put_symbol(&ic, &i_state[32],  l-pl , 1);
1956     put_symbol(&ic, &i_state[64], cb-pcb, 1);
1957     put_symbol(&ic, &i_state[96], cr-pcr, 1);
1958     i_len= ic.bytestream - ic.bytestream_start;
1959     iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
1960
1961 //    assert(score==256*256*256*64-1);
1962     assert(iscore < 255*255*256 + s->lambda2*10);
1963     assert(iscore >= 0);
1964     assert(l>=0 && l<=255);
1965     assert(pl>=0 && pl<=255);
1966
1967     if(level==0){
1968         int varc= iscore >> 8;
1969         int vard= score >> 8;
1970         if (vard <= 64 || vard < varc)
1971             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
1972         else
1973             c->scene_change_score+= s->m.qscale;
1974     }
1975
1976     if(level!=s->block_max_depth){
1977         put_rac(&s->c, &s->block_state[4 + s_context], 0);
1978         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
1979         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
1980         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
1981         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
1982         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
1983
1984         if(score2 < score && score2 < iscore)
1985             return score2;
1986     }
1987
1988     if(iscore < score){
1989         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
1990         memcpy(pbbak, i_buffer, i_len);
1991         s->c= ic;
1992         s->c.bytestream_start= pbbak_start;
1993         s->c.bytestream= pbbak + i_len;
1994         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
1995         memcpy(s->block_state, i_state, sizeof(s->block_state));
1996         return iscore;
1997     }else{
1998         memcpy(pbbak, p_buffer, p_len);
1999         s->c= pc;
2000         s->c.bytestream_start= pbbak_start;
2001         s->c.bytestream= pbbak + p_len;
2002         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
2003         memcpy(s->block_state, p_state, sizeof(s->block_state));
2004         return score;
2005     }
2006 }
2007
2008 static av_always_inline int same_block(BlockNode *a, BlockNode *b){
2009     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
2010         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
2011     }else{
2012         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
2013     }
2014 }
2015
2016 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
2017     const int w= s->b_width  << s->block_max_depth;
2018     const int rem_depth= s->block_max_depth - level;
2019     const int index= (x + y*w) << rem_depth;
2020     int trx= (x+1)<<rem_depth;
2021     BlockNode *b= &s->block[index];
2022     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2023     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2024     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2025     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2026     int pl = left->color[0];
2027     int pcb= left->color[1];
2028     int pcr= left->color[2];
2029     int pmx, pmy;
2030     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2031     int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
2032     int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
2033     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2034
2035     if(s->keyframe){
2036         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
2037         return;
2038     }
2039
2040     if(level!=s->block_max_depth){
2041         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
2042             put_rac(&s->c, &s->block_state[4 + s_context], 1);
2043         }else{
2044             put_rac(&s->c, &s->block_state[4 + s_context], 0);
2045             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
2046             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
2047             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
2048             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
2049             return;
2050         }
2051     }
2052     if(b->type & BLOCK_INTRA){
2053         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2054         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2055         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2056         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2057         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2058         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2059     }else{
2060         pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2061         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2062         if(s->ref_frames > 1)
2063             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2064         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2065         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2066         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2067     }
2068 }
2069
2070 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2071     const int w= s->b_width << s->block_max_depth;
2072     const int rem_depth= s->block_max_depth - level;
2073     const int index= (x + y*w) << rem_depth;
2074     int trx= (x+1)<<rem_depth;
2075     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2076     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2077     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2078     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2079     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2080
2081     if(s->keyframe){
2082         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2083         return;
2084     }
2085
2086     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2087         int type, mx, my;
2088         int l = left->color[0];
2089         int cb= left->color[1];
2090         int cr= left->color[2];
2091         int ref = 0;
2092         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2093         int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2094         int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2095
2096         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2097
2098         if(type){
2099             pred_mv(s, &mx, &my, 0, left, top, tr);
2100             l += get_symbol(&s->c, &s->block_state[32], 1);
2101             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2102             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2103         }else{
2104             if(s->ref_frames > 1)
2105                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2106             pred_mv(s, &mx, &my, ref, left, top, tr);
2107             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2108             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2109         }
2110         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2111     }else{
2112         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2113         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2114         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2115         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2116     }
2117 }
2118
2119 static void encode_blocks(SnowContext *s, int search){
2120     int x, y;
2121     int w= s->b_width;
2122     int h= s->b_height;
2123
2124     if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2125         iterative_me(s);
2126
2127     for(y=0; y<h; y++){
2128         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2129             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2130             return;
2131         }
2132         for(x=0; x<w; x++){
2133             if(s->avctx->me_method == ME_ITER || !search)
2134                 encode_q_branch2(s, 0, x, y);
2135             else
2136                 encode_q_branch (s, 0, x, y);
2137         }
2138     }
2139 }
2140
2141 static void decode_blocks(SnowContext *s){
2142     int x, y;
2143     int w= s->b_width;
2144     int h= s->b_height;
2145
2146     for(y=0; y<h; y++){
2147         for(x=0; x<w; x++){
2148             decode_q_branch(s, 0, x, y);
2149         }
2150     }
2151 }
2152
2153 static void mc_block(Plane *p, uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2154     const static uint8_t weight[64]={
2155     8,7,6,5,4,3,2,1,
2156     7,7,0,0,0,0,0,1,
2157     6,0,6,0,0,0,2,0,
2158     5,0,0,5,0,3,0,0,
2159     4,0,0,0,4,0,0,0,
2160     3,0,0,5,0,3,0,0,
2161     2,0,6,0,0,0,2,0,
2162     1,7,0,0,0,0,0,1,
2163     };
2164
2165     const static uint8_t brane[256]={
2166     0x00,0x01,0x01,0x01,0x01,0x01,0x01,0x01,0x11,0x12,0x12,0x12,0x12,0x12,0x12,0x12,
2167     0x04,0x05,0xcc,0xcc,0xcc,0xcc,0xcc,0x41,0x15,0x16,0xcc,0xcc,0xcc,0xcc,0xcc,0x52,
2168     0x04,0xcc,0x05,0xcc,0xcc,0xcc,0x41,0xcc,0x15,0xcc,0x16,0xcc,0xcc,0xcc,0x52,0xcc,
2169     0x04,0xcc,0xcc,0x05,0xcc,0x41,0xcc,0xcc,0x15,0xcc,0xcc,0x16,0xcc,0x52,0xcc,0xcc,
2170     0x04,0xcc,0xcc,0xcc,0x41,0xcc,0xcc,0xcc,0x15,0xcc,0xcc,0xcc,0x16,0xcc,0xcc,0xcc,
2171     0x04,0xcc,0xcc,0x41,0xcc,0x05,0xcc,0xcc,0x15,0xcc,0xcc,0x52,0xcc,0x16,0xcc,0xcc,
2172     0x04,0xcc,0x41,0xcc,0xcc,0xcc,0x05,0xcc,0x15,0xcc,0x52,0xcc,0xcc,0xcc,0x16,0xcc,
2173     0x04,0x41,0xcc,0xcc,0xcc,0xcc,0xcc,0x05,0x15,0x52,0xcc,0xcc,0xcc,0xcc,0xcc,0x16,
2174     0x44,0x45,0x45,0x45,0x45,0x45,0x45,0x45,0x55,0x56,0x56,0x56,0x56,0x56,0x56,0x56,
2175     0x48,0x49,0xcc,0xcc,0xcc,0xcc,0xcc,0x85,0x59,0x5A,0xcc,0xcc,0xcc,0xcc,0xcc,0x96,
2176     0x48,0xcc,0x49,0xcc,0xcc,0xcc,0x85,0xcc,0x59,0xcc,0x5A,0xcc,0xcc,0xcc,0x96,0xcc,
2177     0x48,0xcc,0xcc,0x49,0xcc,0x85,0xcc,0xcc,0x59,0xcc,0xcc,0x5A,0xcc,0x96,0xcc,0xcc,
2178     0x48,0xcc,0xcc,0xcc,0x49,0xcc,0xcc,0xcc,0x59,0xcc,0xcc,0xcc,0x96,0xcc,0xcc,0xcc,
2179     0x48,0xcc,0xcc,0x85,0xcc,0x49,0xcc,0xcc,0x59,0xcc,0xcc,0x96,0xcc,0x5A,0xcc,0xcc,
2180     0x48,0xcc,0x85,0xcc,0xcc,0xcc,0x49,0xcc,0x59,0xcc,0x96,0xcc,0xcc,0xcc,0x5A,0xcc,
2181     0x48,0x85,0xcc,0xcc,0xcc,0xcc,0xcc,0x49,0x59,0x96,0xcc,0xcc,0xcc,0xcc,0xcc,0x5A,
2182     };
2183
2184     const static uint8_t needs[16]={
2185     0,1,0,0,
2186     2,4,2,0,
2187     0,1,0,0,
2188     15
2189     };
2190
2191     int x, y, b, r, l;
2192     int16_t tmpIt   [64*(32+HTAPS_MAX)];
2193     uint8_t tmp2t[3][stride*(32+HTAPS_MAX)];
2194     int16_t *tmpI= tmpIt;
2195     uint8_t *tmp2= tmp2t[0];
2196     uint8_t *hpel[11];
2197 START_TIMER
2198     assert(dx<16 && dy<16);
2199     r= brane[dx + 16*dy]&15;
2200     l= brane[dx + 16*dy]>>4;
2201
2202     b= needs[l] | needs[r];
2203     if(p && !p->diag_mc)
2204         b= 15;
2205
2206     if(b&5){
2207         for(y=0; y < b_h+HTAPS_MAX-1; y++){
2208             for(x=0; x < b_w; x++){
2209                 int a_1=src[x + HTAPS_MAX/2-4];
2210                 int a0= src[x + HTAPS_MAX/2-3];
2211                 int a1= src[x + HTAPS_MAX/2-2];
2212                 int a2= src[x + HTAPS_MAX/2-1];
2213                 int a3= src[x + HTAPS_MAX/2+0];
2214                 int a4= src[x + HTAPS_MAX/2+1];
2215                 int a5= src[x + HTAPS_MAX/2+2];
2216                 int a6= src[x + HTAPS_MAX/2+3];
2217                 int am=0;
2218                 if(!p || p->fast_mc){
2219                     am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2220                     tmpI[x]= am;
2221                     am= (am+16)>>5;
2222                 }else{
2223                     am= p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6);
2224                     tmpI[x]= am;
2225                     am= (am+32)>>6;
2226                 }
2227
2228                 if(am&(~255)) am= ~(am>>31);
2229                 tmp2[x]= am;
2230             }
2231             tmpI+= 64;
2232             tmp2+= stride;
2233             src += stride;
2234         }
2235         src -= stride*y;
2236     }
2237     src += HTAPS_MAX/2 - 1;
2238     tmp2= tmp2t[1];
2239
2240     if(b&2){
2241         for(y=0; y < b_h; y++){
2242             for(x=0; x < b_w+1; x++){
2243                 int a_1=src[x + (HTAPS_MAX/2-4)*stride];
2244                 int a0= src[x + (HTAPS_MAX/2-3)*stride];
2245                 int a1= src[x + (HTAPS_MAX/2-2)*stride];
2246                 int a2= src[x + (HTAPS_MAX/2-1)*stride];
2247                 int a3= src[x + (HTAPS_MAX/2+0)*stride];
2248                 int a4= src[x + (HTAPS_MAX/2+1)*stride];
2249                 int a5= src[x + (HTAPS_MAX/2+2)*stride];
2250                 int a6= src[x + (HTAPS_MAX/2+3)*stride];
2251                 int am=0;
2252                 if(!p || p->fast_mc)
2253                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 16)>>5;
2254                 else
2255                     am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 32)>>6;
2256
2257                 if(am&(~255)) am= ~(am>>31);
2258                 tmp2[x]= am;
2259             }
2260             src += stride;
2261             tmp2+= stride;
2262         }
2263         src -= stride*y;
2264     }
2265     src += stride*(HTAPS_MAX/2 - 1);
2266     tmp2= tmp2t[2];
2267     tmpI= tmpIt;
2268     if(b&4){
2269         for(y=0; y < b_h; y++){
2270             for(x=0; x < b_w; x++){
2271                 int a_1=tmpI[x + (HTAPS_MAX/2-4)*64];
2272                 int a0= tmpI[x + (HTAPS_MAX/2-3)*64];
2273                 int a1= tmpI[x + (HTAPS_MAX/2-2)*64];
2274                 int a2= tmpI[x + (HTAPS_MAX/2-1)*64];
2275                 int a3= tmpI[x + (HTAPS_MAX/2+0)*64];
2276                 int a4= tmpI[x + (HTAPS_MAX/2+1)*64];
2277                 int a5= tmpI[x + (HTAPS_MAX/2+2)*64];
2278                 int a6= tmpI[x + (HTAPS_MAX/2+3)*64];
2279                 int am=0;
2280                 if(!p || p->fast_mc)
2281                     am= (20*(a2+a3) - 5*(a1+a4) + (a0+a5) + 512)>>10;
2282                 else
2283                     am= (p->hcoeff[0]*(a2+a3) + p->hcoeff[1]*(a1+a4) + p->hcoeff[2]*(a0+a5) + p->hcoeff[3]*(a_1+a6) + 2048)>>12;
2284                 if(am&(~255)) am= ~(am>>31);
2285                 tmp2[x]= am;
2286             }
2287             tmpI+= 64;
2288             tmp2+= stride;
2289         }
2290     }
2291
2292     hpel[ 0]= src;
2293     hpel[ 1]= tmp2t[0] + stride*(HTAPS_MAX/2-1);
2294     hpel[ 2]= src + 1;
2295
2296     hpel[ 4]= tmp2t[1];
2297     hpel[ 5]= tmp2t[2];
2298     hpel[ 6]= tmp2t[1] + 1;
2299
2300     hpel[ 8]= src + stride;
2301     hpel[ 9]= hpel[1] + stride;
2302     hpel[10]= hpel[8] + 1;
2303
2304     if(b==15){
2305         uint8_t *src1= hpel[dx/8 + dy/8*4  ];
2306         uint8_t *src2= hpel[dx/8 + dy/8*4+1];
2307         uint8_t *src3= hpel[dx/8 + dy/8*4+4];
2308         uint8_t *src4= hpel[dx/8 + dy/8*4+5];
2309         dx&=7;
2310         dy&=7;
2311         for(y=0; y < b_h; y++){
2312             for(x=0; x < b_w; x++){
2313                 dst[x]= ((8-dx)*(8-dy)*src1[x] + dx*(8-dy)*src2[x]+
2314                          (8-dx)*   dy *src3[x] + dx*   dy *src4[x]+32)>>6;
2315             }
2316             src1+=stride;
2317             src2+=stride;
2318             src3+=stride;
2319             src4+=stride;
2320             dst +=stride;
2321         }
2322     }else{
2323         uint8_t *src1= hpel[l];
2324         uint8_t *src2= hpel[r];
2325         int a= weight[((dx&7) + (8*(dy&7)))];
2326         int b= 8-a;
2327         for(y=0; y < b_h; y++){
2328             for(x=0; x < b_w; x++){
2329                 dst[x]= (a*src1[x] + b*src2[x] + 4)>>3;
2330             }
2331             src1+=stride;
2332             src2+=stride;
2333             dst +=stride;
2334         }
2335     }
2336 STOP_TIMER("mc_block")
2337 }
2338
2339 #define mca(dx,dy,b_w)\
2340 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, int stride, int h){\
2341     uint8_t tmp[stride*(b_w+HTAPS_MAX-1)];\
2342     assert(h==b_w);\
2343     mc_block(NULL, dst, src-(HTAPS_MAX/2-1)-(HTAPS_MAX/2-1)*stride, tmp, stride, b_w, b_w, dx, dy);\
2344 }
2345
2346 mca( 0, 0,16)
2347 mca( 8, 0,16)
2348 mca( 0, 8,16)
2349 mca( 8, 8,16)
2350 mca( 0, 0,8)
2351 mca( 8, 0,8)
2352 mca( 0, 8,8)
2353 mca( 8, 8,8)
2354
2355 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2356     if(block->type & BLOCK_INTRA){
2357         int x, y;
2358         const int color = block->color[plane_index];
2359         const int color4= color*0x01010101;
2360         if(b_w==32){
2361             for(y=0; y < b_h; y++){
2362                 *(uint32_t*)&dst[0 + y*stride]= color4;
2363                 *(uint32_t*)&dst[4 + y*stride]= color4;
2364                 *(uint32_t*)&dst[8 + y*stride]= color4;
2365                 *(uint32_t*)&dst[12+ y*stride]= color4;
2366                 *(uint32_t*)&dst[16+ y*stride]= color4;
2367                 *(uint32_t*)&dst[20+ y*stride]= color4;
2368                 *(uint32_t*)&dst[24+ y*stride]= color4;
2369                 *(uint32_t*)&dst[28+ y*stride]= color4;
2370             }
2371         }else if(b_w==16){
2372             for(y=0; y < b_h; y++){
2373                 *(uint32_t*)&dst[0 + y*stride]= color4;
2374                 *(uint32_t*)&dst[4 + y*stride]= color4;
2375                 *(uint32_t*)&dst[8 + y*stride]= color4;
2376                 *(uint32_t*)&dst[12+ y*stride]= color4;
2377             }
2378         }else if(b_w==8){
2379             for(y=0; y < b_h; y++){
2380                 *(uint32_t*)&dst[0 + y*stride]= color4;
2381                 *(uint32_t*)&dst[4 + y*stride]= color4;
2382             }
2383         }else if(b_w==4){
2384             for(y=0; y < b_h; y++){
2385                 *(uint32_t*)&dst[0 + y*stride]= color4;
2386             }
2387         }else{
2388             for(y=0; y < b_h; y++){
2389                 for(x=0; x < b_w; x++){
2390                     dst[x + y*stride]= color;
2391                 }
2392             }
2393         }
2394     }else{
2395         uint8_t *src= s->last_picture[block->ref].data[plane_index];
2396         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2397         int mx= block->mx*scale;
2398         int my= block->my*scale;
2399         const int dx= mx&15;
2400         const int dy= my&15;
2401         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2402         sx += (mx>>4) - (HTAPS_MAX/2-1);
2403         sy += (my>>4) - (HTAPS_MAX/2-1);
2404         src += sx + sy*stride;
2405         if(   (unsigned)sx >= w - b_w - (HTAPS_MAX-2)
2406            || (unsigned)sy >= h - b_h - (HTAPS_MAX-2)){
2407             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+HTAPS_MAX-1, b_h+HTAPS_MAX-1, sx, sy, w, h);
2408             src= tmp + MB_SIZE;
2409         }
2410 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2411 //        assert(!(b_w&(b_w-1)));
2412         assert(b_w>1 && b_h>1);
2413         assert(tab_index>=0 && tab_index<4 || b_w==32);
2414         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)) || !s->plane[plane_index].fast_mc )
2415             mc_block(&s->plane[plane_index], dst, src, tmp, stride, b_w, b_h, dx, dy);
2416         else if(b_w==32){
2417             int y;
2418             for(y=0; y<b_h; y+=16){
2419                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 3 + (y+3)*stride,stride);
2420                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 19 + (y+3)*stride,stride);
2421             }
2422         }else if(b_w==b_h)
2423             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 3 + 3*stride,stride);
2424         else if(b_w==2*b_h){
2425             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 3       + 3*stride,stride);
2426             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 3 + b_h + 3*stride,stride);
2427         }else{
2428             assert(2*b_w==b_h);
2429             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 3 + 3*stride           ,stride);
2430             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 3 + 3*stride+b_w*stride,stride);
2431         }
2432     }
2433 }
2434
2435 void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2436                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2437     int y, x;
2438     IDWTELEM * dst;
2439     for(y=0; y<b_h; y++){
2440         //FIXME ugly misuse of obmc_stride
2441         const uint8_t *obmc1= obmc + y*obmc_stride;
2442         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2443         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2444         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2445         dst = slice_buffer_get_line(sb, src_y + y);
2446         for(x=0; x<b_w; x++){
2447             int v=   obmc1[x] * block[3][x + y*src_stride]
2448                     +obmc2[x] * block[2][x + y*src_stride]
2449                     +obmc3[x] * block[1][x + y*src_stride]
2450                     +obmc4[x] * block[0][x + y*src_stride];
2451
2452             v <<= 8 - LOG2_OBMC_MAX;
2453             if(FRAC_BITS != 8){
2454                 v >>= 8 - FRAC_BITS;
2455             }
2456             if(add){
2457                 v += dst[x + src_x];
2458                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2459                 if(v&(~255)) v= ~(v>>31);
2460                 dst8[x + y*src_stride] = v;
2461             }else{
2462                 dst[x + src_x] -= v;
2463             }
2464         }
2465     }
2466 }
2467
2468 //FIXME name clenup (b_w, block_w, b_width stuff)
2469 static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2470     const int b_width = s->b_width  << s->block_max_depth;
2471     const int b_height= s->b_height << s->block_max_depth;
2472     const int b_stride= b_width;
2473     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2474     BlockNode *rt= lt+1;
2475     BlockNode *lb= lt+b_stride;
2476     BlockNode *rb= lb+1;
2477     uint8_t *block[4];
2478     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2479     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2480     uint8_t *ptmp;
2481     int x,y;
2482
2483     if(b_x<0){
2484         lt= rt;
2485         lb= rb;
2486     }else if(b_x + 1 >= b_width){
2487         rt= lt;
2488         rb= lb;
2489     }
2490     if(b_y<0){
2491         lt= lb;
2492         rt= rb;
2493     }else if(b_y + 1 >= b_height){
2494         lb= lt;
2495         rb= rt;
2496     }
2497
2498     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2499         obmc -= src_x;
2500         b_w += src_x;
2501         if(!sliced && !offset_dst)
2502             dst -= src_x;
2503         src_x=0;
2504     }else if(src_x + b_w > w){
2505         b_w = w - src_x;
2506     }
2507     if(src_y<0){
2508         obmc -= src_y*obmc_stride;
2509         b_h += src_y;
2510         if(!sliced && !offset_dst)
2511             dst -= src_y*dst_stride;
2512         src_y=0;
2513     }else if(src_y + b_h> h){
2514         b_h = h - src_y;
2515     }
2516
2517     if(b_w<=0 || b_h<=0) return;
2518
2519 assert(src_stride > 2*MB_SIZE + 5);
2520     if(!sliced && offset_dst)
2521         dst += src_x + src_y*dst_stride;
2522     dst8+= src_x + src_y*src_stride;
2523 //    src += src_x + src_y*src_stride;
2524
2525     ptmp= tmp + 3*tmp_step;
2526     block[0]= ptmp;
2527     ptmp+=tmp_step;
2528     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2529
2530     if(same_block(lt, rt)){
2531         block[1]= block[0];
2532     }else{
2533         block[1]= ptmp;
2534         ptmp+=tmp_step;
2535         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2536     }
2537
2538     if(same_block(lt, lb)){
2539         block[2]= block[0];
2540     }else if(same_block(rt, lb)){
2541         block[2]= block[1];
2542     }else{
2543         block[2]= ptmp;
2544         ptmp+=tmp_step;
2545         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2546     }
2547
2548     if(same_block(lt, rb) ){
2549         block[3]= block[0];
2550     }else if(same_block(rt, rb)){
2551         block[3]= block[1];
2552     }else if(same_block(lb, rb)){
2553         block[3]= block[2];
2554     }else{
2555         block[3]= ptmp;
2556         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2557     }
2558 #if 0
2559     for(y=0; y<b_h; y++){
2560         for(x=0; x<b_w; x++){
2561             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2562             if(add) dst[x + y*dst_stride] += v;
2563             else    dst[x + y*dst_stride] -= v;
2564         }
2565     }
2566     for(y=0; y<b_h; y++){
2567         uint8_t *obmc2= obmc + (obmc_stride>>1);
2568         for(x=0; x<b_w; x++){
2569             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2570             if(add) dst[x + y*dst_stride] += v;
2571             else    dst[x + y*dst_stride] -= v;
2572         }
2573     }
2574     for(y=0; y<b_h; y++){
2575         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2576         for(x=0; x<b_w; x++){
2577             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2578             if(add) dst[x + y*dst_stride] += v;
2579             else    dst[x + y*dst_stride] -= v;
2580         }
2581     }
2582     for(y=0; y<b_h; y++){
2583         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2584         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2585         for(x=0; x<b_w; x++){
2586             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2587             if(add) dst[x + y*dst_stride] += v;
2588             else    dst[x + y*dst_stride] -= v;
2589         }
2590     }
2591 #else
2592     if(sliced){
2593         START_TIMER
2594
2595         s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2596         STOP_TIMER("inner_add_yblock")
2597     }else
2598     for(y=0; y<b_h; y++){
2599         //FIXME ugly misuse of obmc_stride
2600         const uint8_t *obmc1= obmc + y*obmc_stride;
2601         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2602         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2603         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2604         for(x=0; x<b_w; x++){
2605             int v=   obmc1[x] * block[3][x + y*src_stride]
2606                     +obmc2[x] * block[2][x + y*src_stride]
2607                     +obmc3[x] * block[1][x + y*src_stride]
2608                     +obmc4[x] * block[0][x + y*src_stride];
2609
2610             v <<= 8 - LOG2_OBMC_MAX;
2611             if(FRAC_BITS != 8){
2612                 v >>= 8 - FRAC_BITS;
2613             }
2614             if(add){
2615                 v += dst[x + y*dst_stride];
2616                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2617                 if(v&(~255)) v= ~(v>>31);
2618                 dst8[x + y*src_stride] = v;
2619             }else{
2620                 dst[x + y*dst_stride] -= v;
2621             }
2622         }
2623     }
2624 #endif /* 0 */
2625 }
2626
2627 static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
2628     Plane *p= &s->plane[plane_index];
2629     const int mb_w= s->b_width  << s->block_max_depth;
2630     const int mb_h= s->b_height << s->block_max_depth;
2631     int x, y, mb_x;
2632     int block_size = MB_SIZE >> s->block_max_depth;
2633     int block_w    = plane_index ? block_size/2 : block_size;
2634     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2635     int obmc_stride= plane_index ? block_size : 2*block_size;
2636     int ref_stride= s->current_picture.linesize[plane_index];
2637     uint8_t *dst8= s->current_picture.data[plane_index];
2638     int w= p->width;
2639     int h= p->height;
2640     START_TIMER
2641
2642     if(s->keyframe || (s->avctx->debug&512)){
2643         if(mb_y==mb_h)
2644             return;
2645
2646         if(add){
2647             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2648             {
2649 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2650                 IDWTELEM * line = sb->line[y];
2651                 for(x=0; x<w; x++)
2652                 {
2653 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2654                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2655                     v >>= FRAC_BITS;
2656                     if(v&(~255)) v= ~(v>>31);
2657                     dst8[x + y*ref_stride]= v;
2658                 }
2659             }
2660         }else{
2661             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2662             {
2663 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2664                 IDWTELEM * line = sb->line[y];
2665                 for(x=0; x<w; x++)
2666                 {
2667                     line[x] -= 128 << FRAC_BITS;
2668 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2669                 }
2670             }
2671         }
2672
2673         return;
2674     }
2675
2676         for(mb_x=0; mb_x<=mb_w; mb_x++){
2677             START_TIMER
2678
2679             add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2680                        block_w*mb_x - block_w/2,
2681                        block_w*mb_y - block_w/2,
2682                        block_w, block_w,
2683                        w, h,
2684                        w, ref_stride, obmc_stride,
2685                        mb_x - 1, mb_y - 1,
2686                        add, 0, plane_index);
2687
2688             STOP_TIMER("add_yblock")
2689         }
2690
2691     STOP_TIMER("predict_slice")
2692 }
2693
2694 static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
2695     Plane *p= &s->plane[plane_index];
2696     const int mb_w= s->b_width  << s->block_max_depth;
2697     const int mb_h= s->b_height << s->block_max_depth;
2698     int x, y, mb_x;
2699     int block_size = MB_SIZE >> s->block_max_depth;
2700     int block_w    = plane_index ? block_size/2 : block_size;
2701     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2702     const int obmc_stride= plane_index ? block_size : 2*block_size;
2703     int ref_stride= s->current_picture.linesize[plane_index];
2704     uint8_t *dst8= s->current_picture.data[plane_index];
2705     int w= p->width;
2706     int h= p->height;
2707     START_TIMER
2708
2709     if(s->keyframe || (s->avctx->debug&512)){
2710         if(mb_y==mb_h)
2711             return;
2712
2713         if(add){
2714             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2715                 for(x=0; x<w; x++){
2716                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2717                     v >>= FRAC_BITS;
2718                     if(v&(~255)) v= ~(v>>31);
2719                     dst8[x + y*ref_stride]= v;
2720                 }
2721             }
2722         }else{
2723             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2724                 for(x=0; x<w; x++){
2725                     buf[x + y*w]-= 128<<FRAC_BITS;
2726                 }
2727             }
2728         }
2729
2730         return;
2731     }
2732
2733         for(mb_x=0; mb_x<=mb_w; mb_x++){
2734             START_TIMER
2735
2736             add_yblock(s, 0, NULL, buf, dst8, obmc,
2737                        block_w*mb_x - block_w/2,
2738                        block_w*mb_y - block_w/2,
2739                        block_w, block_w,
2740                        w, h,
2741                        w, ref_stride, obmc_stride,
2742                        mb_x - 1, mb_y - 1,
2743                        add, 1, plane_index);
2744
2745             STOP_TIMER("add_yblock")
2746         }
2747
2748     STOP_TIMER("predict_slice")
2749 }
2750
2751 static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){
2752     const int mb_h= s->b_height << s->block_max_depth;
2753     int mb_y;
2754     for(mb_y=0; mb_y<=mb_h; mb_y++)
2755         predict_slice(s, buf, plane_index, add, mb_y);
2756 }
2757
2758 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2759     int i, x2, y2;
2760     Plane *p= &s->plane[plane_index];
2761     const int block_size = MB_SIZE >> s->block_max_depth;
2762     const int block_w    = plane_index ? block_size/2 : block_size;
2763     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2764     const int obmc_stride= plane_index ? block_size : 2*block_size;
2765     const int ref_stride= s->current_picture.linesize[plane_index];
2766     uint8_t *src= s-> input_picture.data[plane_index];
2767     IDWTELEM *dst= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
2768     const int b_stride = s->b_width << s->block_max_depth;
2769     const int w= p->width;
2770     const int h= p->height;
2771     int index= mb_x + mb_y*b_stride;
2772     BlockNode *b= &s->block[index];
2773     BlockNode backup= *b;
2774     int ab=0;
2775     int aa=0;
2776
2777     b->type|= BLOCK_INTRA;
2778     b->color[plane_index]= 0;
2779     memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
2780
2781     for(i=0; i<4; i++){
2782         int mb_x2= mb_x + (i &1) - 1;
2783         int mb_y2= mb_y + (i>>1) - 1;
2784         int x= block_w*mb_x2 + block_w/2;
2785         int y= block_w*mb_y2 + block_w/2;
2786
2787         add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2788                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2789
2790         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2791             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2792                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2793                 int obmc_v= obmc[index];
2794                 int d;
2795                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2796                 if(x<0) obmc_v += obmc[index + block_w];
2797                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2798                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2799                 //FIXME precalc this or simplify it somehow else
2800
2801                 d = -dst[index] + (1<<(FRAC_BITS-1));
2802                 dst[index] = d;
2803                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2804                 aa += obmc_v * obmc_v; //FIXME precalclate this
2805             }
2806         }
2807     }
2808     *b= backup;
2809
2810     return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2811 }
2812
2813 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2814     const int b_stride = s->b_width << s->block_max_depth;
2815     const int b_height = s->b_height<< s->block_max_depth;
2816     int index= x + y*b_stride;
2817     const BlockNode *b     = &s->block[index];
2818     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2819     const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2820     const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2821     const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2822     int dmx, dmy;
2823 //  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2824 //  int my_context= av_log2(2*FFABS(left->my - top->my));
2825
2826     if(x<0 || x>=b_stride || y>=b_height)
2827         return 0;
2828 /*
2829 1            0      0
2830 01X          1-2    1
2831 001XX        3-6    2-3
2832 0001XXX      7-14   4-7
2833 00001XXXX   15-30   8-15
2834 */
2835 //FIXME try accurate rate
2836 //FIXME intra and inter predictors if surrounding blocks arent the same type
2837     if(b->type & BLOCK_INTRA){
2838         return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2839                    + av_log2(2*FFABS(left->color[1] - b->color[1]))
2840                    + av_log2(2*FFABS(left->color[2] - b->color[2])));
2841     }else{
2842         pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2843         dmx-= b->mx;
2844         dmy-= b->my;
2845         return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2846                     + av_log2(2*FFABS(dmy))
2847                     + av_log2(2*b->ref));
2848     }
2849 }
2850
2851 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2852     Plane *p= &s->plane[plane_index];
2853     const int block_size = MB_SIZE >> s->block_max_depth;
2854     const int block_w    = plane_index ? block_size/2 : block_size;
2855     const int obmc_stride= plane_index ? block_size : 2*block_size;
2856     const int ref_stride= s->current_picture.linesize[plane_index];
2857     uint8_t *dst= s->current_picture.data[plane_index];
2858     uint8_t *src= s->  input_picture.data[plane_index];
2859     IDWTELEM *pred= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2860     uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2861     uint8_t tmp[ref_stride*(2*MB_SIZE+HTAPS_MAX-1)];
2862     const int b_stride = s->b_width << s->block_max_depth;
2863     const int b_height = s->b_height<< s->block_max_depth;
2864     const int w= p->width;
2865     const int h= p->height;
2866     int distortion;
2867     int rate= 0;
2868     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2869     int sx= block_w*mb_x - block_w/2;
2870     int sy= block_w*mb_y - block_w/2;
2871     int x0= FFMAX(0,-sx);
2872     int y0= FFMAX(0,-sy);
2873     int x1= FFMIN(block_w*2, w-sx);
2874     int y1= FFMIN(block_w*2, h-sy);
2875     int i,x,y;
2876
2877     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2878
2879     for(y=y0; y<y1; y++){
2880         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2881         const IDWTELEM *pred1 = pred + y*obmc_stride;
2882         uint8_t *cur1 = cur + y*ref_stride;
2883         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2884         for(x=x0; x<x1; x++){
2885 #if FRAC_BITS >= LOG2_OBMC_MAX
2886             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2887 #else
2888             int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
2889 #endif
2890             v = (v + pred1[x]) >> FRAC_BITS;
2891             if(v&(~255)) v= ~(v>>31);
2892             dst1[x] = v;
2893         }
2894     }
2895
2896     /* copy the regions where obmc[] = (uint8_t)256 */
2897     if(LOG2_OBMC_MAX == 8
2898         && (mb_x == 0 || mb_x == b_stride-1)
2899         && (mb_y == 0 || mb_y == b_height-1)){
2900         if(mb_x == 0)
2901             x1 = block_w;
2902         else
2903             x0 = block_w;
2904         if(mb_y == 0)
2905             y1 = block_w;
2906         else
2907             y0 = block_w;
2908         for(y=y0; y<y1; y++)
2909             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2910     }
2911
2912     if(block_w==16){
2913         /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2914         /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2915         /* FIXME cmps overlap but don't cover the wavelet's whole support,
2916          * so improving the score of one block is not strictly guaranteed to
2917          * improve the score of the whole frame, so iterative motion est
2918          * doesn't always converge. */
2919         if(s->avctx->me_cmp == FF_CMP_W97)
2920             distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2921         else if(s->avctx->me_cmp == FF_CMP_W53)
2922             distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2923         else{
2924             distortion = 0;
2925             for(i=0; i<4; i++){
2926                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
2927                 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
2928             }
2929         }
2930     }else{
2931         assert(block_w==8);
2932         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
2933     }
2934
2935     if(plane_index==0){
2936         for(i=0; i<4; i++){
2937 /* ..RRr
2938  * .RXx.
2939  * rxx..
2940  */
2941             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
2942         }
2943         if(mb_x == b_stride-2)
2944             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
2945     }
2946     return distortion + rate*penalty_factor;
2947 }
2948
2949 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
2950     int i, y2;
2951     Plane *p= &s->plane[plane_index];
2952     const int block_size = MB_SIZE >> s->block_max_depth;
2953     const int block_w    = plane_index ? block_size/2 : block_size;
2954     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2955     const int obmc_stride= plane_index ? block_size : 2*block_size;
2956     const int ref_stride= s->current_picture.linesize[plane_index];
2957     uint8_t *dst= s->current_picture.data[plane_index];
2958     uint8_t *src= s-> input_picture.data[plane_index];
2959     static const IDWTELEM zero_dst[4096]; //FIXME
2960     const int b_stride = s->b_width << s->block_max_depth;
2961     const int w= p->width;
2962     const int h= p->height;
2963     int distortion= 0;
2964     int rate= 0;
2965     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2966
2967     for(i=0; i<9; i++){
2968         int mb_x2= mb_x + (i%3) - 1;
2969         int mb_y2= mb_y + (i/3) - 1;
2970         int x= block_w*mb_x2 + block_w/2;
2971         int y= block_w*mb_y2 + block_w/2;
2972
2973         add_yblock(s, 0, NULL, zero_dst, dst, obmc,
2974                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
2975
2976         //FIXME find a cleaner/simpler way to skip the outside stuff
2977         for(y2= y; y2<0; y2++)
2978             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2979         for(y2= h; y2<y+block_w; y2++)
2980             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2981         if(x<0){
2982             for(y2= y; y2<y+block_w; y2++)
2983                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
2984         }
2985         if(x+block_w > w){
2986             for(y2= y; y2<y+block_w; y2++)
2987                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
2988         }
2989
2990         assert(block_w== 8 || block_w==16);
2991         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
2992     }
2993
2994     if(plane_index==0){
2995         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
2996         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
2997
2998 /* ..RRRr
2999  * .RXXx.
3000  * .RXXx.
3001  * rxxx.
3002  */
3003         if(merged)
3004             rate = get_block_bits(s, mb_x, mb_y, 2);
3005         for(i=merged?4:0; i<9; i++){
3006             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
3007             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
3008         }
3009     }
3010     return distortion + rate*penalty_factor;
3011 }
3012
3013 static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
3014     const int b_stride= s->b_width << s->block_max_depth;
3015     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3016     BlockNode backup= *block;
3017     int rd, index, value;
3018
3019     assert(mb_x>=0 && mb_y>=0);
3020     assert(mb_x<b_stride);
3021
3022     if(intra){
3023         block->color[0] = p[0];
3024         block->color[1] = p[1];
3025         block->color[2] = p[2];
3026         block->type |= BLOCK_INTRA;
3027     }else{
3028         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
3029         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
3030         if(s->me_cache[index] == value)
3031             return 0;
3032         s->me_cache[index]= value;
3033
3034         block->mx= p[0];
3035         block->my= p[1];
3036         block->type &= ~BLOCK_INTRA;
3037     }
3038
3039     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
3040
3041 //FIXME chroma
3042     if(rd < *best_rd){
3043         *best_rd= rd;
3044         return 1;
3045     }else{
3046         *block= backup;
3047         return 0;
3048     }
3049 }
3050
3051 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
3052 static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
3053     int p[2] = {p0, p1};
3054     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
3055 }
3056
3057 static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
3058     const int b_stride= s->b_width << s->block_max_depth;
3059     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
3060     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
3061     int rd, index, value;
3062
3063     assert(mb_x>=0 && mb_y>=0);
3064     assert(mb_x<b_stride);
3065     assert(((mb_x|mb_y)&1) == 0);
3066
3067     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
3068     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
3069     if(s->me_cache[index] == value)
3070         return 0;
3071     s->me_cache[index]= value;
3072
3073     block->mx= p0;
3074     block->my= p1;
3075     block->ref= ref;
3076     block->type &= ~BLOCK_INTRA;
3077     block[1]= block[b_stride]= block[b_stride+1]= *block;
3078
3079     rd= get_4block_rd(s, mb_x, mb_y, 0);
3080
3081 //FIXME chroma
3082     if(rd < *best_rd){
3083         *best_rd= rd;
3084         return 1;
3085     }else{
3086         block[0]= backup[0];
3087         block[1]= backup[1];
3088         block[b_stride]= backup[2];
3089         block[b_stride+1]= backup[3];
3090         return 0;
3091     }
3092 }
3093
3094 static void iterative_me(SnowContext *s){
3095     int pass, mb_x, mb_y;
3096     const int b_width = s->b_width  << s->block_max_depth;
3097     const int b_height= s->b_height << s->block_max_depth;
3098     const int b_stride= b_width;
3099     int color[3];
3100
3101     {
3102         RangeCoder r = s->c;
3103         uint8_t state[sizeof(s->block_state)];
3104         memcpy(state, s->block_state, sizeof(s->block_state));
3105         for(mb_y= 0; mb_y<s->b_height; mb_y++)
3106             for(mb_x= 0; mb_x<s->b_width; mb_x++)
3107                 encode_q_branch(s, 0, mb_x, mb_y);
3108         s->c = r;
3109         memcpy(s->block_state, state, sizeof(s->block_state));
3110     }
3111
3112     for(pass=0; pass<25; pass++){
3113         int change= 0;
3114
3115         for(mb_y= 0; mb_y<b_height; mb_y++){
3116             for(mb_x= 0; mb_x<b_width; mb_x++){
3117                 int dia_change, i, j, ref;
3118                 int best_rd= INT_MAX, ref_rd;
3119                 BlockNode backup, ref_b;
3120                 const int index= mb_x + mb_y * b_stride;
3121                 BlockNode *block= &s->block[index];
3122                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
3123                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
3124                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
3125                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
3126                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
3127                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
3128                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
3129                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
3130                 const int b_w= (MB_SIZE >> s->block_max_depth);
3131                 uint8_t obmc_edged[b_w*2][b_w*2];
3132
3133                 if(pass && (block->type & BLOCK_OPT))
3134                     continue;
3135                 block->type |= BLOCK_OPT;
3136
3137                 backup= *block;
3138
3139                 if(!s->me_cache_generation)
3140                     memset(s->me_cache, 0, sizeof(s->me_cache));
3141                 s->me_cache_generation += 1<<22;
3142
3143                 //FIXME precalc
3144                 {
3145                     int x, y;
3146                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
3147                     if(mb_x==0)
3148                         for(y=0; y<b_w*2; y++)
3149                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
3150                     if(mb_x==b_stride-1)
3151                         for(y=0; y<b_w*2; y++)
3152                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
3153                     if(mb_y==0){
3154                         for(x=0; x<b_w*2; x++)
3155                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
3156                         for(y=1; y<b_w; y++)
3157                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
3158                     }
3159                     if(mb_y==b_height-1){
3160                         for(x=0; x<b_w*2; x++)
3161                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
3162                         for(y=b_w; y<b_w*2-1; y++)
3163                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
3164                     }
3165                 }
3166
3167                 //skip stuff outside the picture
3168                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3169                 {
3170                     uint8_t *src= s->  input_picture.data[0];
3171                     uint8_t *dst= s->current_picture.data[0];
3172                     const int stride= s->current_picture.linesize[0];
3173                     const int block_w= MB_SIZE >> s->block_max_depth;
3174                     const int sx= block_w*mb_x - block_w/2;
3175                     const int sy= block_w*mb_y - block_w/2;
3176                     const int w= s->plane[0].width;
3177                     const int h= s->plane[0].height;
3178                     int y;
3179
3180                     for(y=sy; y<0; y++)
3181                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3182                     for(y=h; y<sy+block_w*2; y++)
3183                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3184                     if(sx<0){
3185                         for(y=sy; y<sy+block_w*2; y++)
3186                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3187                     }
3188                     if(sx+block_w*2 > w){
3189                         for(y=sy; y<sy+block_w*2; y++)
3190                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3191                     }
3192                 }
3193
3194                 // intra(black) = neighbors' contribution to the current block
3195                 for(i=0; i<3; i++)
3196                     color[i]= get_dc(s, mb_x, mb_y, i);
3197
3198                 // get previous score (cannot be cached due to OBMC)
3199                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3200                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3201                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3202                 }else
3203                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3204
3205                 ref_b= *block;
3206                 ref_rd= best_rd;
3207                 for(ref=0; ref < s->ref_frames; ref++){
3208                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3209                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3210                         continue;
3211                     block->ref= ref;
3212                     best_rd= INT_MAX;
3213
3214                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3215                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3216                     if(tb)
3217                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3218                     if(lb)
3219                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3220                     if(rb)
3221                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3222                     if(bb)
3223                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3224
3225                     /* fullpel ME */
3226                     //FIXME avoid subpel interpol / round to nearest integer
3227                     do{
3228                         dia_change=0;
3229                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3230                             for(j=0; j<i; j++){
3231                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3232                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3233                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3234                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3235                             }
3236                         }
3237                     }while(dia_change);
3238                     /* subpel ME */
3239                     do{
3240                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3241                         dia_change=0;
3242                         for(i=0; i<8; i++)
3243                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3244                     }while(dia_change);
3245                     //FIXME or try the standard 2 pass qpel or similar
3246
3247                     mvr[0][0]= block->mx;
3248                     mvr[0][1]= block->my;
3249                     if(ref_rd > best_rd){
3250                         ref_rd= best_rd;
3251                         ref_b= *block;
3252                     }
3253                 }
3254                 best_rd= ref_rd;
3255                 *block= ref_b;
3256 #if 1
3257                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3258                 //FIXME RD style color selection
3259 #endif
3260                 if(!same_block(block, &backup)){
3261                     if(tb ) tb ->type &= ~BLOCK_OPT;
3262                     if(lb ) lb ->type &= ~BLOCK_OPT;
3263                     if(rb ) rb ->type &= ~BLOCK_OPT;
3264                     if(bb ) bb ->type &= ~BLOCK_OPT;
3265                     if(tlb) tlb->type &= ~BLOCK_OPT;
3266                     if(trb) trb->type &= ~BLOCK_OPT;
3267                     if(blb) blb->type &= ~BLOCK_OPT;
3268                     if(brb) brb->type &= ~BLOCK_OPT;
3269                     change ++;
3270                 }
3271             }
3272         }
3273         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3274         if(!change)
3275             break;
3276     }
3277
3278     if(s->block_max_depth == 1){
3279         int change= 0;
3280         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3281             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3282                 int i;
3283                 int best_rd, init_rd;
3284                 const int index= mb_x + mb_y * b_stride;
3285                 BlockNode *b[4];
3286
3287                 b[0]= &s->block[index];
3288                 b[1]= b[0]+1;
3289                 b[2]= b[0]+b_stride;
3290                 b[3]= b[2]+1;
3291                 if(same_block(b[0], b[1]) &&
3292                    same_block(b[0], b[2]) &&
3293                    same_block(b[0], b[3]))
3294                     continue;
3295
3296                 if(!s->me_cache_generation)
3297                     memset(s->me_cache, 0, sizeof(s->me_cache));
3298                 s->me_cache_generation += 1<<22;
3299
3300                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3301
3302                 //FIXME more multiref search?
3303                 check_4block_inter(s, mb_x, mb_y,
3304                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3305                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3306
3307                 for(i=0; i<4; i++)
3308                     if(!(b[i]->type&BLOCK_INTRA))
3309                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3310
3311                 if(init_rd != best_rd)
3312                     change++;
3313             }
3314         }
3315         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3316     }
3317 }
3318
3319 static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
3320     const int level= b->level;
3321     const int w= b->width;
3322     const int h= b->height;
3323     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3324     const int qmul= qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
3325     int x,y, thres1, thres2;
3326 //    START_TIMER
3327
3328     if(s->qlog == LOSSLESS_QLOG){
3329         for(y=0; y<h; y++)
3330             for(x=0; x<w; x++)
3331                 dst[x + y*stride]= src[x + y*stride];
3332         return;
3333     }
3334
3335     bias= bias ? 0 : (3*qmul)>>3;
3336     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3337     thres2= 2*thres1;
3338
3339     if(!bias){
3340         for(y=0; y<h; y++){
3341             for(x=0; x<w; x++){
3342                 int i= src[x + y*stride];
3343
3344                 if((unsigned)(i+thres1) > thres2){
3345                     if(i>=0){
3346                         i<<= QEXPSHIFT;
3347                         i/= qmul; //FIXME optimize
3348                         dst[x + y*stride]=  i;
3349                     }else{
3350                         i= -i;
3351                         i<<= QEXPSHIFT;
3352                         i/= qmul; //FIXME optimize
3353                         dst[x + y*stride]= -i;
3354                     }
3355                 }else
3356                     dst[x + y*stride]= 0;
3357             }
3358         }
3359     }else{
3360         for(y=0; y<h; y++){
3361             for(x=0; x<w; x++){
3362                 int i= src[x + y*stride];
3363
3364                 if((unsigned)(i+thres1) > thres2){
3365                     if(i>=0){
3366                         i<<= QEXPSHIFT;
3367                         i= (i + bias) / qmul; //FIXME optimize
3368                         dst[x + y*stride]=  i;
3369                     }else{
3370                         i= -i;
3371                         i<<= QEXPSHIFT;
3372                         i= (i + bias) / qmul; //FIXME optimize
3373                         dst[x + y*stride]= -i;
3374                     }
3375                 }else
3376                     dst[x + y*stride]= 0;
3377             }
3378         }
3379     }
3380     if(level+1 == s->spatial_decomposition_count){
3381 //        STOP_TIMER("quantize")
3382     }
3383 }
3384
3385 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
3386     const int w= b->width;
3387     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3388     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3389     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3390     int x,y;
3391     START_TIMER
3392
3393     if(s->qlog == LOSSLESS_QLOG) return;
3394
3395     for(y=start_y; y<end_y; y++){
3396 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3397         IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3398         for(x=0; x<w; x++){
3399             int i= line[x];
3400             if(i<0){
3401                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3402             }else if(i>0){
3403                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3404             }
3405         }
3406     }
3407     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3408         STOP_TIMER("dquant")
3409     }
3410 }
3411
3412 static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
3413     const int w= b->width;
3414     const int h= b->height;
3415     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3416     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3417     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3418     int x,y;
3419     START_TIMER
3420
3421     if(s->qlog == LOSSLESS_QLOG) return;
3422
3423     for(y=0; y<h; y++){
3424         for(x=0; x<w; x++){
3425             int i= src[x + y*stride];
3426             if(i<0){
3427                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3428             }else if(i>0){
3429                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3430             }
3431         }
3432     }
3433     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3434         STOP_TIMER("dquant")
3435     }
3436 }
3437
3438 static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3439     const int w= b->width;
3440     const int h= b->height;
3441     int x,y;
3442
3443     for(y=h-1; y>=0; y--){
3444         for(x=w-1; x>=0; x--){
3445             int i= x + y*stride;
3446
3447             if(x){
3448                 if(use_median){
3449                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3450                     else  src[i] -= src[i - 1];
3451                 }else{
3452                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3453                     else  src[i] -= src[i - 1];
3454                 }
3455             }else{
3456                 if(y) src[i] -= src[i - stride];
3457             }
3458         }
3459     }
3460 }
3461
3462 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3463     const int w= b->width;
3464     int x,y;
3465
3466 //    START_TIMER
3467
3468     IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3469     IDWTELEM * prev;
3470
3471     if (start_y != 0)
3472         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3473
3474     for(y=start_y; y<end_y; y++){
3475         prev = line;
3476 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3477         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3478         for(x=0; x<w; x++){
3479             if(x){
3480                 if(use_median){
3481                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3482                     else  line[x] += line[x - 1];
3483                 }else{
3484                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3485                     else  line[x] += line[x - 1];
3486                 }
3487             }else{
3488                 if(y) line[x] += prev[x];
3489             }
3490         }
3491     }
3492
3493 //    STOP_TIMER("correlate")
3494 }
3495
3496 static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3497     const int w= b->width;
3498     const int h= b->height;
3499     int x,y;
3500
3501     for(y=0; y<h; y++){
3502         for(x=0; x<w; x++){
3503             int i= x + y*stride;
3504
3505             if(x){
3506                 if(use_median){
3507                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3508                     else  src[i] += src[i - 1];
3509                 }else{
3510                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3511                     else  src[i] += src[i - 1];
3512                 }
3513             }else{
3514                 if(y) src[i] += src[i - stride];
3515             }
3516         }
3517     }
3518 }
3519
3520 static void encode_qlogs(SnowContext *s){
3521     int plane_index, level, orientation;
3522
3523     for(plane_index=0; plane_index<2; plane_index++){
3524         for(level=0; level<s->spatial_decomposition_count; level++){
3525             for(orientation=level ? 1:0; orientation<4; orientation++){
3526                 if(orientation==2) continue;
3527                 put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3528             }
3529         }
3530     }
3531 }
3532
3533 static void encode_header(SnowContext *s){
3534     int plane_index, i;
3535     uint8_t kstate[32];
3536
3537     memset(kstate, MID_STATE, sizeof(kstate));
3538
3539     put_rac(&s->c, kstate, s->keyframe);
3540     if(s->keyframe || s->always_reset){
3541         reset_contexts(s);
3542         s->last_spatial_decomposition_type=
3543         s->last_qlog=
3544         s->last_qbias=
3545         s->last_mv_scale=
3546         s->last_block_max_depth= 0;
3547         for(plane_index=0; plane_index<2; plane_index++){
3548             Plane *p= &s->plane[plane_index];
3549             p->last_htaps=0;
3550             p->last_diag_mc=0;
3551             memset(p->last_hcoeff, 0, sizeof(p->last_hcoeff));
3552         }
3553     }
3554     if(s->keyframe){
3555         put_symbol(&s->c, s->header_state, s->version, 0);
3556         put_rac(&s->c, s->header_state, s->always_reset);
3557         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3558         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3559         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3560         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3561         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3562         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3563         put_rac(&s->c, s->header_state, s->spatial_scalability);
3564 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3565         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3566
3567         encode_qlogs(s);
3568     }
3569
3570     if(!s->keyframe){
3571         int update_mc=0;
3572         for(plane_index=0; plane_index<2; plane_index++){
3573             Plane *p= &s->plane[plane_index];
3574             update_mc |= p->last_htaps   != p->htaps;
3575             update_mc |= p->last_diag_mc != p->diag_mc;
3576             update_mc |= !!memcmp(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
3577         }
3578         put_rac(&s->c, s->header_state, update_mc);
3579         if(update_mc){
3580             for(plane_index=0; plane_index<2; plane_index++){
3581                 Plane *p= &s->plane[plane_index];
3582                 put_rac(&s->c, s->header_state, p->diag_mc);
3583                 put_symbol(&s->c, s->header_state, p->htaps/2-1, 0);
3584                 for(i= p->htaps/2; i; i--)
3585                     put_symbol(&s->c, s->header_state, FFABS(p->hcoeff[i]), 0);
3586             }
3587         }
3588         if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
3589             put_rac(&s->c, s->header_state, 1);
3590             put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3591             encode_qlogs(s);
3592         }else
3593             put_rac(&s->c, s->header_state, 0);
3594     }
3595
3596     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3597     put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3598     put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3599     put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3600     put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3601
3602 }
3603
3604 static void update_last_header_values(SnowContext *s){
3605     int plane_index;
3606
3607     if(!s->keyframe){
3608         for(plane_index=0; plane_index<2; plane_index++){
3609             Plane *p= &s->plane[plane_index];
3610             p->last_diag_mc= p->diag_mc;
3611             p->last_htaps  = p->htaps;
3612             memcpy(p->last_hcoeff, p->hcoeff, sizeof(p->hcoeff));
3613         }
3614     }
3615
3616     s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3617     s->last_qlog                      = s->qlog;
3618     s->last_qbias                     = s->qbias;
3619     s->last_mv_scale                  = s->mv_scale;
3620     s->last_block_max_depth           = s->block_max_depth;
3621     s->last_spatial_decomposition_count= s->spatial_decomposition_count;
3622 }
3623
3624 static void decode_qlogs(SnowContext *s){
3625     int plane_index, level, orientation;
3626
3627     for(plane_index=0; plane_index<3; plane_index++){
3628         for(level=0; level<s->spatial_decomposition_count; level++){
3629             for(orientation=level ? 1:0; orientation<4; orientation++){
3630                 int q;
3631                 if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3632                 else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3633                 else                    q= get_symbol(&s->c, s->header_state, 1);
3634                 s->plane[plane_index].band[level][orientation].qlog= q;
3635             }
3636         }
3637     }
3638 }
3639
3640 static int decode_header(SnowContext *s){
3641     int plane_index;
3642     uint8_t kstate[32];
3643
3644     memset(kstate, MID_STATE, sizeof(kstate));
3645
3646     s->keyframe= get_rac(&s->c, kstate);
3647     if(s->keyframe || s->always_reset){
3648         reset_contexts(s);
3649         s->spatial_decomposition_type=
3650         s->qlog=
3651         s->qbias=
3652         s->mv_scale=
3653         s->block_max_depth= 0;
3654     }
3655     if(s->keyframe){
3656         s->version= get_symbol(&s->c, s->header_state, 0);
3657         if(s->version>0){
3658             av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3659             return -1;
3660         }
3661         s->always_reset= get_rac(&s->c, s->header_state);
3662         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3663         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3664         s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3665         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3666         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3667         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3668         s->spatial_scalability= get_rac(&s->c, s->header_state);
3669 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3670         s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3671
3672         decode_qlogs(s);
3673     }
3674
3675     if(!s->keyframe){
3676         if(get_rac(&s->c, s->header_state)){
3677             for(plane_index=0; plane_index<2; plane_index++){
3678                 int htaps, i, sum=0;
3679                 Plane *p= &s->plane[plane_index];
3680                 p->diag_mc= get_rac(&s->c, s->header_state);
3681                 htaps= get_symbol(&s->c, s->header_state, 0)*2 + 2;
3682                 if((unsigned)htaps > HTAPS_MAX || htaps==0)
3683                     return -1;
3684                 p->htaps= htaps;
3685                 for(i= htaps/2; i; i--){
3686                     p->hcoeff[i]= get_symbol(&s->c, s->header_state, 0) * (1-2*(i&1));
3687                     sum += p->hcoeff[i];
3688                 }
3689                 p->hcoeff[0]= 32-sum;
3690             }
3691             s->plane[2].diag_mc= s->plane[1].diag_mc;
3692             s->plane[2].htaps  = s->plane[1].htaps;
3693             memcpy(s->plane[2].hcoeff, s->plane[1].hcoeff, sizeof(s->plane[1].hcoeff));
3694         }
3695         if(get_rac(&s->c, s->header_state)){
3696             s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3697             decode_qlogs(s);
3698         }
3699     }
3700
3701     s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3702     if(s->spatial_decomposition_type > 1){
3703         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3704         return -1;
3705     }
3706
3707     s->qlog           += get_symbol(&s->c, s->header_state, 1);
3708     s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3709     s->qbias          += get_symbol(&s->c, s->header_state, 1);
3710     s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3711     if(s->block_max_depth > 1 || s->block_max_depth < 0){
3712         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3713         s->block_max_depth= 0;
3714         return -1;
3715     }
3716
3717     return 0;
3718 }
3719
3720 static void init_qexp(void){
3721     int i;
3722     double v=128;
3723
3724     for(i=0; i<QROOT; i++){
3725         qexp[i]= lrintf(v);
3726         v *= pow(2, 1.0 / QROOT);
3727     }
3728 }
3729
3730 static int common_init(AVCodecContext *avctx){
3731     SnowContext *s = avctx->priv_data;
3732     int width, height;
3733     int i, j;
3734
3735     s->avctx= avctx;
3736
3737     dsputil_init(&s->dsp, avctx);
3738
3739 #define mcf(dx,dy)\
3740     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3741     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3742         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3743     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3744     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3745         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3746
3747     mcf( 0, 0)
3748     mcf( 4, 0)
3749     mcf( 8, 0)
3750     mcf(12, 0)
3751     mcf( 0, 4)
3752     mcf( 4, 4)
3753     mcf( 8, 4)
3754     mcf(12, 4)
3755     mcf( 0, 8)
3756     mcf( 4, 8)
3757     mcf( 8, 8)
3758     mcf(12, 8)
3759     mcf( 0,12)
3760     mcf( 4,12)
3761     mcf( 8,12)
3762     mcf(12,12)
3763
3764 #define mcfh(dx,dy)\
3765     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3766     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3767         mc_block_hpel ## dx ## dy ## 16;\
3768     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3769     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3770         mc_block_hpel ## dx ## dy ## 8;
3771
3772     mcfh(0, 0)
3773     mcfh(8, 0)
3774     mcfh(0, 8)
3775     mcfh(8, 8)
3776
3777     if(!qexp[0])
3778         init_qexp();
3779
3780 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3781
3782     width= s->avctx->width;
3783     height= s->avctx->height;
3784
3785     s->spatial_idwt_buffer= av_mallocz(width*height*sizeof(IDWTELEM));
3786     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM)); //FIXME this doesnt belong here
3787
3788     for(i=0; i<MAX_REF_FRAMES; i++)
3789         for(j=0; j<MAX_REF_FRAMES; j++)
3790             scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3791
3792     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3793
3794     return 0;
3795 }
3796
3797 static int common_init_after_header(AVCodecContext *avctx){
3798     SnowContext *s = avctx->priv_data;
3799     int plane_index, level, orientation;
3800
3801     for(plane_index=0; plane_index<3; plane_index++){
3802         int w= s->avctx->width;
3803         int h= s->avctx->height;
3804
3805         if(plane_index){
3806             w>>= s->chroma_h_shift;
3807             h>>= s->chroma_v_shift;
3808         }
3809         s->plane[plane_index].width = w;
3810         s->plane[plane_index].height= h;
3811
3812 //av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3813         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3814             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3815                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3816
3817                 b->buf= s->spatial_dwt_buffer;
3818                 b->level= level;
3819                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3820                 b->width = (w + !(orientation&1))>>1;
3821                 b->height= (h + !(orientation>1))>>1;
3822
3823                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3824                 b->buf_x_offset = 0;
3825                 b->buf_y_offset = 0;
3826
3827                 if(orientation&1){
3828                     b->buf += (w+1)>>1;
3829                     b->buf_x_offset = (w+1)>>1;
3830                 }
3831                 if(orientation>1){
3832                     b->buf += b->stride>>1;
3833                     b->buf_y_offset = b->stride_line >> 1;
3834                 }
3835                 b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer);
3836
3837                 if(level)
3838                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3839                 //FIXME avoid this realloc
3840                 av_freep(&b->x_coeff);
3841                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3842             }
3843             w= (w+1)>>1;
3844             h= (h+1)>>1;
3845         }
3846     }
3847
3848     return 0;
3849 }
3850
3851 static int qscale2qlog(int qscale){
3852     return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3853            + 61*QROOT/8; //<64 >60
3854 }
3855
3856 static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3857 {
3858     /* estimate the frame's complexity as a sum of weighted dwt coefs.
3859      * FIXME we know exact mv bits at this point,
3860      * but ratecontrol isn't set up to include them. */
3861     uint32_t coef_sum= 0;
3862     int level, orientation, delta_qlog;
3863
3864     for(level=0; level<s->spatial_decomposition_count; level++){
3865         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3866             SubBand *b= &s->plane[0].band[level][orientation];
3867             IDWTELEM *buf= b->ibuf;
3868             const int w= b->width;
3869             const int h= b->height;
3870             const int stride= b->stride;
3871             const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3872             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3873             const int qdiv= (1<<16)/qmul;
3874             int x, y;
3875             //FIXME this is ugly
3876             for(y=0; y<h; y++)
3877                 for(x=0; x<w; x++)
3878                     buf[x+y*stride]= b->buf[x+y*stride];
3879             if(orientation==0)
3880                 decorrelate(s, b, buf, stride, 1, 0);
3881             for(y=0; y<h; y++)
3882                 for(x=0; x<w; x++)
3883                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3884         }
3885     }
3886
3887     /* ugly, ratecontrol just takes a sqrt again */
3888     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3889     assert(coef_sum < INT_MAX);
3890
3891     if(pict->pict_type == I_TYPE){
3892         s->m.current_picture.mb_var_sum= coef_sum;
3893         s->m.current_picture.mc_mb_var_sum= 0;
3894     }else{
3895         s->m.current_picture.mc_mb_var_sum= coef_sum;
3896         s->m.current_picture.mb_var_sum= 0;
3897     }
3898
3899     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3900     if (pict->quality < 0)
3901         return INT_MIN;
3902     s->lambda= pict->quality * 3/2;
3903     delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3904     s->qlog+= delta_qlog;
3905     return delta_qlog;
3906 }
3907
3908 static void calculate_visual_weight(SnowContext *s, Plane *p){
3909     int width = p->width;
3910     int height= p->height;
3911     int level, orientation, x, y;
3912
3913     for(level=0; level<s->spatial_decomposition_count; level++){
3914         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3915             SubBand *b= &p->band[level][orientation];
3916             IDWTELEM *ibuf= b->ibuf;
3917             int64_t error=0;
3918
3919             memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
3920             ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
3921             ff_spatial_idwt(s->spatial_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3922             for(y=0; y<height; y++){
3923                 for(x=0; x<width; x++){
3924                     int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
3925                     error += d*d;
3926                 }
3927             }
3928
3929             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3930 //            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3931         }
3932     }
3933 }
3934
3935 #define QUANTIZE2 0
3936
3937 #if QUANTIZE2==1
3938 #define Q2_STEP 8
3939
3940 static void find_sse(SnowContext *s, Plane *p, int *score, int score_stride, IDWTELEM *r0, IDWTELEM *r1, int level, int orientation){
3941     SubBand *b= &p->band[level][orientation];
3942     int x, y;
3943     int xo=0;
3944     int yo=0;
3945     int step= 1 << (s->spatial_decomposition_count - level);
3946
3947     if(orientation&1)
3948         xo= step>>1;
3949     if(orientation&2)
3950         yo= step>>1;
3951
3952     //FIXME bias for non zero ?
3953     //FIXME optimize
3954     memset(score, 0, sizeof(*score)*score_stride*((p->height + Q2_STEP-1)/Q2_STEP));
3955     for(y=0; y<p->height; y++){
3956         for(x=0; x<p->width; x++){
3957             int sx= (x-xo + step/2) / step / Q2_STEP;
3958             int sy= (y-yo + step/2) / step / Q2_STEP;
3959             int v= r0[x + y*p->width] - r1[x + y*p->width];
3960             assert(sx>=0 && sy>=0 && sx < score_stride);
3961             v= ((v+8)>>4)<<4;
3962             score[sx + sy*score_stride] += v*v;
3963             assert(score[sx + sy*score_stride] >= 0);
3964         }
3965     }
3966 }
3967
3968 static void dequantize_all(SnowContext *s, Plane *p, IDWTELEM *buffer, int width, int height){
3969     int level, orientation;
3970
3971     for(level=0; level<s->spatial_decomposition_count; level++){
3972         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3973             SubBand *b= &p->band[level][orientation];
3974             IDWTELEM *dst= buffer + (b->ibuf - s->spatial_idwt_buffer);
3975
3976             dequantize(s, b, dst, b->stride);
3977         }
3978     }
3979 }
3980
3981 static void dwt_quantize(SnowContext *s, Plane *p, DWTELEM *buffer, int width, int height, int stride, int type){
3982     int level, orientation, ys, xs, x, y, pass;
3983     IDWTELEM best_dequant[height * stride];
3984     IDWTELEM idwt2_buffer[height * stride];
3985     const int score_stride= (width + 10)/Q2_STEP;
3986     int best_score[(width + 10)/Q2_STEP * (height + 10)/Q2_STEP]; //FIXME size
3987     int score[(width + 10)/Q2_STEP * (height + 10)/Q2_STEP]; //FIXME size
3988     int threshold= (s->m.lambda * s->m.lambda) >> 6;
3989
3990     //FIXME pass the copy cleanly ?
3991
3992 //    memcpy(dwt_buffer, buffer, height * stride * sizeof(DWTELEM));
3993     ff_spatial_dwt(buffer, width, height, stride, type, s->spatial_decomposition_count);
3994
3995     for(level=0; level<s->spatial_decomposition_count; level++){
3996         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3997             SubBand *b= &p->band[level][orientation];
3998             IDWTELEM *dst= best_dequant + (b->ibuf - s->spatial_idwt_buffer);
3999              DWTELEM *src=       buffer + (b-> buf - s->spatial_dwt_buffer);
4000             assert(src == b->buf); // code doesnt depen on this but its true currently
4001
4002             quantize(s, b, dst, src, b->stride, s->qbias);
4003         }
4004     }
4005     for(pass=0; pass<1; pass++){
4006         if(s->qbias == 0) //keyframe
4007             continue;
4008         for(level=0; level<s->spatial_decomposition_count; level++){
4009             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4010                 SubBand *b= &p->band[level][orientation];
4011                 IDWTELEM *dst= idwt2_buffer + (b->ibuf - s->spatial_idwt_buffer);
4012                 IDWTELEM *best_dst= best_dequant + (b->ibuf - s->spatial_idwt_buffer);
4013
4014                 for(ys= 0; ys<Q2_STEP; ys++){
4015                     for(xs= 0; xs<Q2_STEP; xs++){
4016                         memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
4017                         dequantize_all(s, p, idwt2_buffer, width, height);
4018                         ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
4019                         find_sse(s, p, best_score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
4020                         memcpy(idwt2_buffer, best_dequant, height * stride * sizeof(IDWTELEM));
4021                         for(y=ys; y<b->height; y+= Q2_STEP){
4022                             for(x=xs; x<b->width; x+= Q2_STEP){
4023                                 if(dst[x + y*b->stride]<0) dst[x + y*b->stride]++;
4024                                 if(dst[x + y*b->stride]>0) dst[x + y*b->stride]--;
4025                                 //FIXME try more then just --
4026                             }
4027                         }
4028                         dequantize_all(s, p, idwt2_buffer, width, height);
4029                         ff_spatial_idwt(idwt2_buffer, width, height, stride, type, s->spatial_decomposition_count);
4030                         find_sse(s, p, score, score_stride, idwt2_buffer, s->spatial_idwt_buffer, level, orientation);
4031                         for(y=ys; y<b->height; y+= Q2_STEP){
4032                             for(x=xs; x<b->width; x+= Q2_STEP){
4033                                 int score_idx= x/Q2_STEP + (y/Q2_STEP)*score_stride;
4034                                 if(score[score_idx] <= best_score[score_idx] + threshold){
4035                                     best_score[score_idx]= score[score_idx];
4036                                     if(best_dst[x + y*b->stride]<0) best_dst[x + y*b->stride]++;
4037                                     if(best_dst[x + y*b->stride]>0) best_dst[x + y*b->stride]--;
4038                                     //FIXME copy instead
4039                                 }
4040                             }
4041                         }
4042                     }
4043                 }
4044             }
4045         }
4046     }
4047     memcpy(s->spatial_idwt_buffer, best_dequant, height * stride * sizeof(IDWTELEM)); //FIXME work with that directly insteda of copy at the end
4048 }
4049
4050 #endif /* QUANTIZE2==1 */
4051
4052 static int encode_init(AVCodecContext *avctx)
4053 {
4054     SnowContext *s = avctx->priv_data;
4055     int plane_index;
4056
4057     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
4058         av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
4059                "use vstrict=-2 / -strict -2 to use it anyway\n");
4060         return -1;
4061     }
4062
4063     if(avctx->prediction_method == DWT_97
4064        && (avctx->flags & CODEC_FLAG_QSCALE)
4065        && avctx->global_quality == 0){
4066         av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
4067         return -1;
4068     }
4069
4070     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
4071
4072     s->chroma_h_shift= 1; //FIXME XXX
4073     s->chroma_v_shift= 1;
4074
4075     s->mv_scale       = (avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
4076     s->block_max_depth= (avctx->flags & CODEC_FLAG_4MV ) ? 1 : 0;
4077
4078     for(plane_index=0; plane_index<3; plane_index++){
4079         s->plane[plane_index].diag_mc= 1;
4080         s->plane[plane_index].htaps= 6;
4081         s->plane[plane_index].hcoeff[0]=  40;
4082         s->plane[plane_index].hcoeff[1]= -10;
4083         s->plane[plane_index].hcoeff[2]=   2;
4084         s->plane[plane_index].fast_mc= 1;
4085     }
4086
4087     common_init(avctx);
4088     alloc_blocks(s);
4089
4090     s->version=0;
4091
4092     s->m.avctx   = avctx;
4093     s->m.flags   = avctx->flags;
4094     s->m.bit_rate= avctx->bit_rate;
4095
4096     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
4097     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4098     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
4099     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
4100     h263_encode_init(&s->m); //mv_penalty
4101
4102     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
4103
4104     if(avctx->flags&CODEC_FLAG_PASS1){
4105         if(!avctx->stats_out)
4106             avctx->stats_out = av_mallocz(256);
4107     }
4108     if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
4109         if(ff_rate_control_init(&s->m) < 0)
4110             return -1;
4111     }
4112     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
4113
4114     avctx->coded_frame= &s->current_picture;
4115     switch(avctx->pix_fmt){
4116 //    case PIX_FMT_YUV444P:
4117 //    case PIX_FMT_YUV422P:
4118     case PIX_FMT_YUV420P:
4119     case PIX_FMT_GRAY8:
4120 //    case PIX_FMT_YUV411P:
4121 //    case PIX_FMT_YUV410P:
4122         s->colorspace_type= 0;
4123         break;
4124 /*    case PIX_FMT_RGB32:
4125         s->colorspace= 1;
4126         break;*/
4127     default:
4128         av_log(avctx, AV_LOG_ERROR, "pixel format not supported\n");
4129         return -1;
4130     }
4131 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
4132     s->chroma_h_shift= 1;
4133     s->chroma_v_shift= 1;
4134
4135     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
4136     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
4137
4138     s->avctx->get_buffer(s->avctx, &s->input_picture);
4139
4140     if(s->avctx->me_method == ME_ITER){
4141         int i;
4142         int size= s->b_width * s->b_height << 2*s->block_max_depth;
4143         for(i=0; i<s->max_ref_frames; i++){
4144             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
4145             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
4146         }
4147     }
4148
4149     return 0;
4150 }
4151
4152 #define USE_HALFPEL_PLANE 0
4153
4154 static void halfpel_interpol(SnowContext *s, uint8_t *halfpel[4][4], AVFrame *frame){
4155     int p,x,y;
4156
4157     assert(!(s->avctx->flags & CODEC_FLAG_EMU_EDGE));
4158
4159     for(p=0; p<3; p++){
4160         int is_chroma= !!p;
4161         int w= s->avctx->width  >>is_chroma;
4162         int h= s->avctx->height >>is_chroma;
4163         int ls= frame->linesize[p];
4164         uint8_t *src= frame->data[p];
4165
4166         halfpel[1][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
4167         halfpel[2][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
4168         halfpel[3][p]= (uint8_t*)av_malloc(ls * (h+2*EDGE_WIDTH)) + EDGE_WIDTH*(1+ls);
4169
4170         halfpel[0][p]= src;
4171         for(y=0; y<h; y++){
4172             for(x=0; x<w; x++){
4173                 int i= y*ls + x;
4174
4175                 halfpel[1][p][i]= (20*(src[i] + src[i+1]) - 5*(src[i-1] + src[i+2]) + (src[i-2] + src[i+3]) + 16 )>>5;
4176             }
4177         }
4178         for(y=0; y<h; y++){
4179             for(x=0; x<w; x++){
4180                 int i= y*ls + x;
4181
4182                 halfpel[2][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
4183             }
4184         }
4185         src= halfpel[1][p];
4186         for(y=0; y<h; y++){
4187             for(x=0; x<w; x++){
4188                 int i= y*ls + x;
4189
4190                 halfpel[3][p][i]= (20*(src[i] + src[i+ls]) - 5*(src[i-ls] + src[i+2*ls]) + (src[i-2*ls] + src[i+3*ls]) + 16 )>>5;
4191             }
4192         }
4193
4194 //FIXME border!
4195     }
4196 }
4197
4198 static int frame_start(SnowContext *s){
4199    AVFrame tmp;
4200    int w= s->avctx->width; //FIXME round up to x16 ?
4201    int h= s->avctx->height;
4202
4203     if(s->current_picture.data[0]){
4204         draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
4205         draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
4206         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
4207     }
4208
4209     tmp= s->last_picture[s->max_ref_frames-1];
4210     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
4211     memmove(s->halfpel_plane+1, s->halfpel_plane, (s->max_ref_frames-1)*sizeof(void*)*4*4);
4212     if(USE_HALFPEL_PLANE && s->current_picture.data[0])
4213         halfpel_interpol(s, s->halfpel_plane[0], &s->current_picture);
4214     s->last_picture[0]= s->current_picture;
4215     s->current_picture= tmp;
4216
4217     if(s->keyframe){
4218         s->ref_frames= 0;
4219     }else{
4220         int i;
4221         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
4222             if(i && s->last_picture[i-1].key_frame)
4223                 break;
4224         s->ref_frames= i;
4225     }
4226
4227     s->current_picture.reference= 1;
4228     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
4229         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
4230         return -1;
4231     }
4232
4233     s->current_picture.key_frame= s->keyframe;
4234
4235     return 0;
4236 }
4237
4238 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
4239     SnowContext *s = avctx->priv_data;
4240     RangeCoder * const c= &s->c;
4241     AVFrame *pict = data;
4242     const int width= s->avctx->width;
4243     const int height= s->avctx->height;
4244     int level, orientation, plane_index, i, y;
4245     uint8_t rc_header_bak[sizeof(s->header_state)];
4246     uint8_t rc_block_bak[sizeof(s->block_state)];
4247
4248     ff_init_range_encoder(c, buf, buf_size);
4249     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4250
4251     for(i=0; i<3; i++){
4252         int shift= !!i;
4253         for(y=0; y<(height>>shift); y++)
4254             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
4255                    &pict->data[i][y * pict->linesize[i]],
4256                    width>>shift);
4257     }
4258     s->new_picture = *pict;
4259
4260     s->m.picture_number= avctx->frame_number;
4261     if(avctx->flags&CODEC_FLAG_PASS2){
4262         s->m.pict_type =
4263         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
4264         s->keyframe= pict->pict_type==FF_I_TYPE;
4265         if(!(avctx->flags&CODEC_FLAG_QSCALE)) {
4266             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
4267             if (pict->quality < 0)
4268                 return -1;
4269         }
4270     }else{
4271         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
4272         s->m.pict_type=
4273         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
4274     }
4275
4276     if(s->pass1_rc && avctx->frame_number == 0)
4277         pict->quality= 2*FF_QP2LAMBDA;
4278     if(pict->quality){
4279         s->qlog= qscale2qlog(pict->quality);
4280         s->lambda = pict->quality * 3/2;
4281     }
4282     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
4283         s->qlog= LOSSLESS_QLOG;
4284         s->lambda = 0;
4285     }//else keep previous frame's qlog until after motion est
4286
4287     frame_start(s);
4288
4289     s->m.current_picture_ptr= &s->m.current_picture;
4290     if(pict->pict_type == P_TYPE){
4291         int block_width = (width +15)>>4;
4292         int block_height= (height+15)>>4;
4293         int stride= s->current_picture.linesize[0];
4294
4295         assert(s->current_picture.data[0]);
4296         assert(s->last_picture[0].data[0]);
4297
4298         s->m.avctx= s->avctx;
4299         s->m.current_picture.data[0]= s->current_picture.data[0];
4300         s->m.   last_picture.data[0]= s->last_picture[0].data[0];
4301         s->m.    new_picture.data[0]= s->  input_picture.data[0];
4302         s->m.   last_picture_ptr= &s->m.   last_picture;
4303         s->m.linesize=
4304         s->m.   last_picture.linesize[0]=
4305         s->m.    new_picture.linesize[0]=
4306         s->m.current_picture.linesize[0]= stride;
4307         s->m.uvlinesize= s->current_picture.linesize[1];
4308         s->m.width = width;
4309         s->m.height= height;
4310         s->m.mb_width = block_width;
4311         s->m.mb_height= block_height;
4312         s->m.mb_stride=   s->m.mb_width+1;
4313         s->m.b8_stride= 2*s->m.mb_width+1;
4314         s->m.f_code=1;
4315         s->m.pict_type= pict->pict_type;
4316         s->m.me_method= s->avctx->me_method;
4317         s->m.me.scene_change_score=0;
4318         s->m.flags= s->avctx->flags;
4319         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
4320         s->m.out_format= FMT_H263;
4321         s->m.unrestricted_mv= 1;
4322
4323         s->m.lambda = s->lambda;
4324         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
4325         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
4326
4327         s->m.dsp= s->dsp; //move
4328         ff_init_me(&s->m);
4329         s->dsp= s->m.dsp;
4330     }
4331
4332     if(s->pass1_rc){
4333         memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
4334         memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
4335     }
4336
4337 redo_frame:
4338
4339     if(pict->pict_type == I_TYPE)
4340         s->spatial_decomposition_count= 5;
4341     else
4342         s->spatial_decomposition_count= 5;
4343
4344     s->m.pict_type = pict->pict_type;
4345     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
4346
4347     common_init_after_header(avctx);
4348
4349     if(s->last_spatial_decomposition_count != s->spatial_decomposition_count){
4350         for(plane_index=0; plane_index<3; plane_index++){
4351             calculate_visual_weight(s, &s->plane[plane_index]);
4352         }
4353     }
4354
4355     encode_header(s);
4356     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4357     encode_blocks(s, 1);
4358     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
4359
4360     for(plane_index=0; plane_index<3; plane_index++){
4361         Plane *p= &s->plane[plane_index];
4362         int w= p->width;
4363         int h= p->height;
4364         int x, y;
4365 //        int bits= put_bits_count(&s->c.pb);
4366
4367     if(!(avctx->flags2 & CODEC_FLAG2_MEMC_ONLY)){
4368         //FIXME optimize
4369      if(pict->data[plane_index]) //FIXME gray hack
4370         for(y=0; y<h; y++){
4371             for(x=0; x<w; x++){
4372                 s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
4373             }
4374         }
4375         predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
4376
4377         if(   plane_index==0
4378            && pict->pict_type == P_TYPE
4379            && !(avctx->flags&CODEC_FLAG_PASS2)
4380            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
4381             ff_init_range_encoder(c, buf, buf_size);
4382             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4383             pict->pict_type= FF_I_TYPE;
4384             s->keyframe=1;
4385             s->current_picture.key_frame=1;
4386             goto redo_frame;
4387         }
4388
4389         if(s->qlog == LOSSLESS_QLOG){
4390             for(y=0; y<h; y++){
4391                 for(x=0; x<w; x++){
4392                     s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
4393                 }
4394             }
4395         }else{
4396             for(y=0; y<h; y++){
4397                 for(x=0; x<w; x++){
4398                     s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
4399                 }
4400             }
4401         }
4402
4403         if(QUANTIZE2)
4404             dwt_quantize(s, p, s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type);
4405         else
4406             ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4407
4408         if(s->pass1_rc && plane_index==0){
4409             int delta_qlog = ratecontrol_1pass(s, pict);
4410             if (delta_qlog <= INT_MIN)
4411                 return -1;
4412             if(delta_qlog){
4413                 //reordering qlog in the bitstream would eliminate this reset
4414                 ff_init_range_encoder(c, buf, buf_size);
4415                 memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
4416                 memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
4417                 encode_header(s);
4418                 encode_blocks(s, 0);
4419             }
4420         }
4421
4422         for(level=0; level<s->spatial_decomposition_count; level++){
4423             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4424                 SubBand *b= &p->band[level][orientation];
4425
4426                 if(!QUANTIZE2)
4427                     quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
4428                 if(orientation==0)
4429                     decorrelate(s, b, b->ibuf, b->stride, pict->pict_type == P_TYPE, 0);
4430                 encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
4431                 assert(b->parent==NULL || b->parent->stride == b->stride*2);
4432                 if(orientation==0)
4433                     correlate(s, b, b->ibuf, b->stride, 1, 0);
4434             }
4435         }
4436 //        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
4437
4438         for(level=0; level<s->spatial_decomposition_count; level++){
4439             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4440                 SubBand *b= &p->band[level][orientation];
4441
4442                 dequantize(s, b, b->ibuf, b->stride);
4443             }
4444         }
4445
4446         ff_spatial_idwt(s->spatial_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4447         if(s->qlog == LOSSLESS_QLOG){
4448             for(y=0; y<h; y++){
4449                 for(x=0; x<w; x++){
4450                     s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
4451                 }
4452             }
4453         }
4454 {START_TIMER
4455         predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4456 STOP_TIMER("pred-conv")}
4457       }else{
4458             //ME/MC only
4459             if(pict->pict_type == I_TYPE){
4460                 for(y=0; y<h; y++){
4461                     for(x=0; x<w; x++){
4462                         s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
4463                             pict->data[plane_index][y*pict->linesize[plane_index] + x];
4464                     }
4465                 }
4466             }else{
4467                 memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
4468                 predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4469             }
4470       }
4471         if(s->avctx->flags&CODEC_FLAG_PSNR){
4472             int64_t error= 0;
4473
4474     if(pict->data[plane_index]) //FIXME gray hack
4475             for(y=0; y<h; y++){
4476                 for(x=0; x<w; x++){
4477                     int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4478                     error += d*d;
4479                 }
4480             }
4481             s->avctx->error[plane_index] += error;
4482             s->current_picture.error[plane_index] = error;
4483         }
4484     }
4485
4486     update_last_header_values(s);
4487
4488     if(s->last_picture[s->max_ref_frames-1].data[0]){
4489         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4490         for(i=0; i<9; i++)
4491             if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3])
4492                 av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture.linesize[i%3]));
4493     }
4494
4495     s->current_picture.coded_picture_number = avctx->frame_number;
4496     s->current_picture.pict_type = pict->pict_type;
4497     s->current_picture.quality = pict->quality;
4498     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4499     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
4500     s->m.current_picture.display_picture_number =
4501     s->m.current_picture.coded_picture_number = avctx->frame_number;
4502     s->m.current_picture.quality = pict->quality;
4503     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4504     if(s->pass1_rc)
4505         if (ff_rate_estimate_qscale(&s->m, 0) < 0)
4506             return -1;
4507     if(avctx->flags&CODEC_FLAG_PASS1)
4508         ff_write_pass1_stats(&s->m);
4509     s->m.last_pict_type = s->m.pict_type;
4510     avctx->frame_bits = s->m.frame_bits;
4511     avctx->mv_bits = s->m.mv_bits;
4512     avctx->misc_bits = s->m.misc_bits;
4513     avctx->p_tex_bits = s->m.p_tex_bits;
4514
4515     emms_c();
4516
4517     return ff_rac_terminate(c);
4518 }
4519
4520 static void common_end(SnowContext *s){
4521     int plane_index, level, orientation, i;
4522
4523     av_freep(&s->spatial_dwt_buffer);
4524     av_freep(&s->spatial_idwt_buffer);
4525
4526     av_freep(&s->m.me.scratchpad);
4527     av_freep(&s->m.me.map);
4528     av_freep(&s->m.me.score_map);
4529     av_freep(&s->m.obmc_scratchpad);
4530
4531     av_freep(&s->block);
4532
4533     for(i=0; i<MAX_REF_FRAMES; i++){
4534         av_freep(&s->ref_mvs[i]);
4535         av_freep(&s->ref_scores[i]);
4536         if(s->last_picture[i].data[0])
4537             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
4538     }
4539
4540     for(plane_index=0; plane_index<3; plane_index++){
4541         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4542             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4543                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4544
4545                 av_freep(&b->x_coeff);
4546             }
4547         }
4548     }
4549 }
4550
4551 static int encode_end(AVCodecContext *avctx)
4552 {
4553     SnowContext *s = avctx->priv_data;
4554
4555     common_end(s);
4556     av_free(avctx->stats_out);
4557
4558     return 0;
4559 }
4560
4561 static int decode_init(AVCodecContext *avctx)
4562 {
4563     avctx->pix_fmt= PIX_FMT_YUV420P;
4564
4565     common_init(avctx);
4566
4567     return 0;
4568 }
4569
4570 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
4571     SnowContext *s = avctx->priv_data;
4572     RangeCoder * const c= &s->c;
4573     int bytes_read;
4574     AVFrame *picture = data;
4575     int level, orientation, plane_index, i;
4576
4577     ff_init_range_decoder(c, buf, buf_size);
4578     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4579
4580     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4581     if(decode_header(s)<0)
4582         return -1;
4583     common_init_after_header(avctx);
4584
4585     // realloc slice buffer for the case that spatial_decomposition_count changed
4586     slice_buffer_destroy(&s->sb);
4587     slice_buffer_init(&s->sb, s->plane[0].height, (MB_SIZE >> s->block_max_depth) + s->spatial_decomposition_count * 8 + 1, s->plane[0].width, s->spatial_idwt_buffer);
4588
4589     for(plane_index=0; plane_index<3; plane_index++){
4590         Plane *p= &s->plane[plane_index];
4591         p->fast_mc= p->diag_mc && p->htaps==6 && p->hcoeff[0]==40
4592                                               && p->hcoeff[1]==-10
4593                                               && p->hcoeff[2]==2;
4594     }
4595
4596     if(!s->block) alloc_blocks(s);
4597
4598     frame_start(s);
4599     //keyframe flag dupliaction mess FIXME
4600     if(avctx->debug&FF_DEBUG_PICT_INFO)
4601         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4602
4603     decode_blocks(s);
4604
4605     for(plane_index=0; plane_index<3; plane_index++){
4606         Plane *p= &s->plane[plane_index];
4607         int w= p->width;
4608         int h= p->height;
4609         int x, y;
4610         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4611
4612 if(s->avctx->debug&2048){
4613         memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4614         predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4615
4616         for(y=0; y<h; y++){
4617             for(x=0; x<w; x++){
4618                 int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4619                 s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4620             }
4621         }
4622 }
4623
4624 {   START_TIMER
4625     for(level=0; level<s->spatial_decomposition_count; level++){
4626         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4627             SubBand *b= &p->band[level][orientation];
4628             unpack_coeffs(s, b, b->parent, orientation);
4629         }
4630     }
4631     STOP_TIMER("unpack coeffs");
4632 }
4633
4634 {START_TIMER
4635     const int mb_h= s->b_height << s->block_max_depth;
4636     const int block_size = MB_SIZE >> s->block_max_depth;
4637     const int block_w    = plane_index ? block_size/2 : block_size;
4638     int mb_y;
4639     dwt_compose_t cs[MAX_DECOMPOSITIONS];
4640     int yd=0, yq=0;
4641     int y;
4642     int end_y;
4643
4644     ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4645     for(mb_y=0; mb_y<=mb_h; mb_y++){
4646
4647         int slice_starty = block_w*mb_y;
4648         int slice_h = block_w*(mb_y+1);
4649         if (!(s->keyframe || s->avctx->debug&512)){
4650             slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4651             slice_h -= (block_w >> 1);
4652         }
4653
4654         {
4655         START_TIMER
4656         for(level=0; level<s->spatial_decomposition_count; level++){
4657             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4658                 SubBand *b= &p->band[level][orientation];
4659                 int start_y;
4660                 int end_y;
4661                 int our_mb_start = mb_y;
4662                 int our_mb_end = (mb_y + 1);
4663                 const int extra= 3;
4664                 start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4665                 end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4666                 if (!(s->keyframe || s->avctx->debug&512)){
4667                     start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4668                     end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4669                 }
4670                 start_y = FFMIN(b->height, start_y);
4671                 end_y = FFMIN(b->height, end_y);
4672
4673                 if (start_y != end_y){
4674                     if (orientation == 0){
4675                         SubBand * correlate_band = &p->band[0][0];
4676                         int correlate_end_y = FFMIN(b->height, end_y + 1);
4677                         int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4678                         decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4679                         correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4680                         dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
4681                     }
4682                     else
4683                         decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4684                 }
4685             }
4686         }
4687         STOP_TIMER("decode_subband_slice");
4688         }
4689
4690 {   START_TIMER
4691         for(; yd<slice_h; yd+=4){
4692             ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4693         }
4694     STOP_TIMER("idwt slice");}
4695
4696
4697         if(s->qlog == LOSSLESS_QLOG){
4698             for(; yq<slice_h && yq<h; yq++){
4699                 IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4700                 for(x=0; x<w; x++){
4701                     line[x] <<= FRAC_BITS;
4702                 }
4703             }
4704         }
4705
4706         predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
4707
4708         y = FFMIN(p->height, slice_starty);
4709         end_y = FFMIN(p->height, slice_h);
4710         while(y < end_y)
4711             slice_buffer_release(&s->sb, y++);
4712     }
4713
4714     slice_buffer_flush(&s->sb);
4715
4716 STOP_TIMER("idwt + predict_slices")}
4717     }
4718
4719     emms_c();
4720
4721     if(s->last_picture[s->max_ref_frames-1].data[0]){
4722         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4723         for(i=0; i<9; i++)
4724             if(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3])
4725                 av_free(s->halfpel_plane[s->max_ref_frames-1][1+i/3][i%3] - EDGE_WIDTH*(1+s->current_picture.linesize[i%3]));
4726     }
4727
4728 if(!(s->avctx->debug&2048))
4729     *picture= s->current_picture;
4730 else
4731     *picture= s->mconly_picture;
4732
4733     *data_size = sizeof(AVFrame);
4734
4735     bytes_read= c->bytestream - c->bytestream_start;
4736     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4737
4738     return bytes_read;
4739 }
4740
4741 static int decode_end(AVCodecContext *avctx)
4742 {
4743     SnowContext *s = avctx->priv_data;
4744
4745     slice_buffer_destroy(&s->sb);
4746
4747     common_end(s);
4748
4749     return 0;
4750 }
4751
4752 AVCodec snow_decoder = {
4753     "snow",
4754     CODEC_TYPE_VIDEO,
4755     CODEC_ID_SNOW,
4756     sizeof(SnowContext),
4757     decode_init,
4758     NULL,
4759     decode_end,
4760     decode_frame,
4761     0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4762     NULL
4763 };
4764
4765 #ifdef CONFIG_SNOW_ENCODER
4766 AVCodec snow_encoder = {
4767     "snow",
4768     CODEC_TYPE_VIDEO,
4769     CODEC_ID_SNOW,
4770     sizeof(SnowContext),
4771     encode_init,
4772     encode_frame,
4773     encode_end,
4774 };
4775 #endif
4776
4777
4778 #ifdef TEST
4779 #undef malloc
4780 #undef free
4781 #undef printf
4782 #undef random
4783
4784 int main(void){
4785     int width=256;
4786     int height=256;
4787     int buffer[2][width*height];
4788     SnowContext s;
4789     int i;
4790     s.spatial_decomposition_count=6;
4791     s.spatial_decomposition_type=1;
4792
4793     printf("testing 5/3 DWT\n");
4794     for(i=0; i<width*height; i++)
4795         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4796
4797     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4798     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4799
4800     for(i=0; i<width*height; i++)
4801         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4802
4803     printf("testing 9/7 DWT\n");
4804     s.spatial_decomposition_type=0;
4805     for(i=0; i<width*height; i++)
4806         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4807
4808     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4809     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4810
4811     for(i=0; i<width*height; i++)
4812         if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4813
4814 #if 0
4815     printf("testing AC coder\n");
4816     memset(s.header_state, 0, sizeof(s.header_state));
4817     ff_init_range_encoder(&s.c, buffer[0], 256*256);
4818     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4819
4820     for(i=-256; i<256; i++){
4821 START_TIMER
4822         put_symbol(&s.c, s.header_state, i*i*i/3*FFABS(i), 1);
4823 STOP_TIMER("put_symbol")
4824     }
4825     ff_rac_terminate(&s.c);
4826
4827     memset(s.header_state, 0, sizeof(s.header_state));
4828     ff_init_range_decoder(&s.c, buffer[0], 256*256);
4829     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4830
4831     for(i=-256; i<256; i++){
4832         int j;
4833 START_TIMER
4834         j= get_symbol(&s.c, s.header_state, 1);
4835 STOP_TIMER("get_symbol")
4836         if(j!=i*i*i/3*FFABS(i)) printf("fsck: %d != %d\n", i, j);
4837     }
4838 #endif
4839 {
4840 int level, orientation, x, y;
4841 int64_t errors[8][4];
4842 int64_t g=0;
4843
4844     memset(errors, 0, sizeof(errors));
4845     s.spatial_decomposition_count=3;
4846     s.spatial_decomposition_type=0;
4847     for(level=0; level<s.spatial_decomposition_count; level++){
4848         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4849             int w= width  >> (s.spatial_decomposition_count-level);
4850             int h= height >> (s.spatial_decomposition_count-level);
4851             int stride= width  << (s.spatial_decomposition_count-level);
4852             DWTELEM *buf= buffer[0];
4853             int64_t error=0;
4854
4855             if(orientation&1) buf+=w;
4856             if(orientation>1) buf+=stride>>1;
4857
4858             memset(buffer[0], 0, sizeof(int)*width*height);
4859             buf[w/2 + h/2*stride]= 256*256;
4860             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4861             for(y=0; y<height; y++){
4862                 for(x=0; x<width; x++){
4863                     int64_t d= buffer[0][x + y*width];
4864                     error += d*d;
4865                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
4866                 }
4867                 if(FFABS(height/2-y)<9 && level==2) printf("\n");
4868             }
4869             error= (int)(sqrt(error)+0.5);
4870             errors[level][orientation]= error;
4871             if(g) g=ff_gcd(g, error);
4872             else g= error;
4873         }
4874     }
4875     printf("static int const visual_weight[][4]={\n");
4876     for(level=0; level<s.spatial_decomposition_count; level++){
4877         printf("  {");
4878         for(orientation=0; orientation<4; orientation++){
4879             printf("%8"PRId64",", errors[level][orientation]/g);
4880         }
4881         printf("},\n");
4882     }
4883     printf("};\n");
4884     {
4885             int level=2;
4886             int orientation=3;
4887             int w= width  >> (s.spatial_decomposition_count-level);
4888             int h= height >> (s.spatial_decomposition_count-level);
4889             int stride= width  << (s.spatial_decomposition_count-level);
4890             DWTELEM *buf= buffer[0];
4891             int64_t error=0;
4892
4893             buf+=w;
4894             buf+=stride>>1;
4895
4896             memset(buffer[0], 0, sizeof(int)*width*height);
4897 #if 1
4898             for(y=0; y<height; y++){
4899                 for(x=0; x<width; x++){
4900                     int tab[4]={0,2,3,1};
4901                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
4902                 }
4903             }
4904             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4905 #else
4906             for(y=0; y<h; y++){
4907                 for(x=0; x<w; x++){
4908                     buf[x + y*stride  ]=169;
4909                     buf[x + y*stride-w]=64;
4910                 }
4911             }
4912             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4913 #endif
4914             for(y=0; y<height; y++){
4915                 for(x=0; x<width; x++){
4916                     int64_t d= buffer[0][x + y*width];
4917                     error += d*d;
4918                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
4919                 }
4920                 if(FFABS(height/2-y)<9) printf("\n");
4921             }
4922     }
4923
4924 }
4925     return 0;
4926 }
4927 #endif /* TEST */