libavcodec/snow.c

   1 /*
   2  * Copyright (C) 2004 Michael Niedermayer <michaelni@gmx.at>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "avcodec.h"
  22 #include "dsputil.h"
  23 #include "snow.h"
  24
  25 #include "rangecoder.h"
  26
  27 #include "mpegvideo.h"
  28
  29 #undef NDEBUG
  30 #include <assert.h>
  31
  32 static const int8_t quant3[256]={
  33  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  34  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  35  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  36  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  37  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  38  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  39  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  40  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  41 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  42 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  43 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  44 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  45 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  46 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  47 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  48 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, 0,
  49 };
  50 static const int8_t quant3b[256]={
  51  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  52  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  53  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  54  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  55  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  56  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  57  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  58  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  59 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  60 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  61 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  62 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  63 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  64 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  65 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  66 -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  67 };
  68 static const int8_t quant3bA[256]={
  69  0, 0, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  70  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  71  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  72  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  73  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  74  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  75  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  76  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  77  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  78  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  79  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  80  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  81  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  82  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  83  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  84  1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1, 1,-1,
  85 };
  86 static const int8_t quant5[256]={
  87  0, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  88  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  89  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  90  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  91  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  92  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  93  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  94  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  95 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  96 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  97 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  98 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
  99 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 100 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 101 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 102 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,-1,
 103 };
 104 static const int8_t quant7[256]={
 105  0, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 106  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 107  2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
 108  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 109  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 110  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 111  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 112  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 113 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 114 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 115 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 116 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 117 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,
 118 -3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-2,-2,-2,
 119 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,
 120 -2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,-1,-1,
 121 };
 122 static const int8_t quant9[256]={
 123  0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
 124  3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 125  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 126  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 127  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 128  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 129  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 130  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 131 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 132 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 133 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 134 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 135 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 136 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 137 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,
 138 -3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-2,-1,-1,
 139 };
 140 static const int8_t quant11[256]={
 141  0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
 142  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
 143  4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 144  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 145  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 146  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 147  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 148  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 149 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 150 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 151 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 152 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 153 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 154 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-4,-4,
 155 -4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,-4,
 156 -4,-4,-4,-4,-4,-3,-3,-3,-3,-3,-3,-3,-2,-2,-2,-1,
 157 };
 158 static const int8_t quant13[256]={
 159  0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
 160  4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 161  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 162  5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 163  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 164  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 165  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 166  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
 167 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 168 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 169 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 170 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,
 171 -6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-6,-5,
 172 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 173 -5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,-5,
 174 -4,-4,-4,-4,-4,-4,-4,-4,-4,-3,-3,-3,-3,-2,-2,-1,
 175 };
 176
 177 #if 0 //64*cubic
 178 static const uint8_t obmc32[1024]={
 179  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 180  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 181  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 182  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 183  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 184  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 185  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 186  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 187  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 188  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 189  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 190  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 191  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 192  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 193  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 194  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 195  0, 2, 4, 8,12,18,23,29,35,41,46,52,56,60,62,64,64,62,60,56,52,46,41,35,29,23,18,12, 8, 4, 2, 0,
 196  0, 1, 4, 8,12,17,22,28,34,40,45,51,55,58,61,62,62,61,58,55,51,45,40,34,28,22,17,12, 8, 4, 1, 0,
 197  0, 2, 4, 7,12,16,22,27,33,38,44,48,52,56,58,60,60,58,56,52,48,44,38,33,27,22,16,12, 7, 4, 2, 0,
 198  0, 1, 4, 7,11,15,20,25,31,36,41,45,49,52,55,56,56,55,52,49,45,41,36,31,25,20,15,11, 7, 4, 1, 0,
 199  0, 1, 3, 6,10,14,19,23,28,33,38,42,45,48,51,52,52,51,48,45,42,38,33,28,23,19,14,10, 6, 3, 1, 0,
 200  0, 1, 3, 6, 9,12,17,21,25,30,34,38,41,44,45,46,46,45,44,41,38,34,30,25,21,17,12, 9, 6, 3, 1, 0,
 201  0, 1, 3, 5, 8,11,15,19,22,26,30,33,36,38,40,41,41,40,38,36,33,30,26,22,19,15,11, 8, 5, 3, 1, 0,
 202  0, 1, 2, 4, 7,10,13,16,19,22,25,28,31,33,34,35,35,34,33,31,28,25,22,19,16,13,10, 7, 4, 2, 1, 0,
 203  0, 1, 2, 4, 6, 8,10,13,16,19,21,23,25,27,28,29,29,28,27,25,23,21,19,16,13,10, 8, 6, 4, 2, 1, 0,
 204  0, 1, 1, 3, 4, 6, 8,10,13,15,17,19,20,22,22,23,23,22,22,20,19,17,15,13,10, 8, 6, 4, 3, 1, 1, 0,
 205  0, 1, 1, 2, 3, 5, 6, 8,10,11,13,14,15,16,17,18,18,17,16,15,14,13,11,10, 8, 6, 5, 3, 2, 1, 1, 0,
 206  0, 0, 1, 2, 2, 3, 4, 6, 7, 8, 9,10,11,12,12,12,12,12,12,11,10, 9, 8, 7, 6, 4, 3, 2, 2, 1, 0, 0,
 207  0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8, 8, 8, 7, 7, 6, 6, 5, 4, 4, 3, 2, 2, 1, 1, 0, 0,
 208  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 209  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 210  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 211 //error:0.000022
 212 };
 213 static const uint8_t obmc16[256]={
 214  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 215  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 216  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 217  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 218  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 219  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 220  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 221  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 222  1, 6,15,26,38,49,57,62,62,57,49,38,26,15, 6, 1,
 223  1, 5,13,24,34,45,53,57,57,53,45,34,24,13, 5, 1,
 224  0, 5,11,20,29,38,45,49,49,45,38,29,20,11, 5, 0,
 225  0, 4, 9,15,23,29,34,38,38,34,29,23,15, 9, 4, 0,
 226  0, 2, 6,11,15,20,24,26,26,24,20,15,11, 6, 2, 0,
 227  0, 1, 4, 6, 9,11,13,15,15,13,11, 9, 6, 4, 1, 0,
 228  0, 1, 1, 2, 4, 5, 5, 6, 6, 5, 5, 4, 2, 1, 1, 0,
 229  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 230 //error:0.000033
 231 };
 232 #elif 1 // 64*linear
 233 static const uint8_t obmc32[1024]={
 234   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 235   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 236   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 237   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 238   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 239   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 240   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 241   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 242   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 243   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 244   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 245   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 246   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 247   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 248   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 249   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 250   8, 24, 40, 56, 68, 84,100,116,132,148,164,180,192,208,224,240,240,224,208,192,180,164,148,132,116,100, 84, 68, 56, 40, 24,  8,
 251   8, 20, 36, 52, 64, 80, 96,108,124,136,152,168,180,196,212,224,224,212,196,180,168,152,136,124,108, 96, 80, 64, 52, 36, 20,  8,
 252   8, 20, 32, 48, 60, 76, 88,100,116,128,140,156,168,184,196,208,208,196,184,168,156,140,128,116,100, 88, 76, 60, 48, 32, 20,  8,
 253   8, 20, 32, 44, 56, 68, 80, 92,108,120,132,144,156,168,180,192,192,180,168,156,144,132,120,108, 92, 80, 68, 56, 44, 32, 20,  8,
 254   4, 16, 28, 40, 52, 64, 76, 88, 96,108,120,132,144,156,168,180,180,168,156,144,132,120,108, 96, 88, 76, 64, 52, 40, 28, 16,  4,
 255   4, 16, 28, 36, 48, 56, 68, 80, 88,100,112,120,132,140,152,164,164,152,140,132,120,112,100, 88, 80, 68, 56, 48, 36, 28, 16,  4,
 256   4, 16, 24, 32, 44, 52, 60, 72, 80, 92,100,108,120,128,136,148,148,136,128,120,108,100, 92, 80, 72, 60, 52, 44, 32, 24, 16,  4,
 257   4, 12, 20, 28, 40, 48, 56, 64, 72, 80, 88, 96,108,116,124,132,132,124,116,108, 96, 88, 80, 72, 64, 56, 48, 40, 28, 20, 12,  4,
 258   4, 12, 20, 28, 32, 40, 48, 56, 64, 72, 80, 88, 92,100,108,116,116,108,100, 92, 88, 80, 72, 64, 56, 48, 40, 32, 28, 20, 12,  4,
 259   4,  8, 16, 24, 28, 36, 44, 48, 56, 60, 68, 76, 80, 88, 96,100,100, 96, 88, 80, 76, 68, 60, 56, 48, 44, 36, 28, 24, 16,  8,  4,
 260   4,  8, 12, 20, 24, 32, 36, 40, 48, 52, 56, 64, 68, 76, 80, 84, 84, 80, 76, 68, 64, 56, 52, 48, 40, 36, 32, 24, 20, 12,  8,  4,
 261   4,  8, 12, 16, 20, 24, 28, 32, 40, 44, 48, 52, 56, 60, 64, 68, 68, 64, 60, 56, 52, 48, 44, 40, 32, 28, 24, 20, 16, 12,  8,  4,
 262   0,  4,  8, 12, 16, 20, 24, 28, 28, 32, 36, 40, 44, 48, 52, 56, 56, 52, 48, 44, 40, 36, 32, 28, 28, 24, 20, 16, 12,  8,  4,  0,
 263   0,  4,  8,  8, 12, 12, 16, 20, 20, 24, 28, 28, 32, 32, 36, 40, 40, 36, 32, 32, 28, 28, 24, 20, 20, 16, 12, 12,  8,  8,  4,  0,
 264   0,  4,  4,  4,  8,  8,  8, 12, 12, 16, 16, 16, 20, 20, 20, 24, 24, 20, 20, 20, 16, 16, 16, 12, 12,  8,  8,  8,  4,  4,  4,  0,
 265   0,  0,  0,  0,  4,  4,  4,  4,  4,  4,  4,  4,  8,  8,  8,  8,  8,  8,  8,  8,  4,  4,  4,  4,  4,  4,  4,  4,  0,  0,  0,  0,
 266  //error:0.000020
 267 };
 268 static const uint8_t obmc16[256]={
 269   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 270   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 271   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 272   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 273   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 274  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 275  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 276  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 277  16, 44, 76,104,136,164,196,224,224,196,164,136,104, 76, 44, 16,
 278  12, 40, 64, 92,116,144,168,196,196,168,144,116, 92, 64, 40, 12,
 279  12, 32, 56, 76,100,120,144,164,164,144,120,100, 76, 56, 32, 12,
 280   8, 28, 44, 64, 80,100,116,136,136,116,100, 80, 64, 44, 28,  8,
 281   8, 20, 36, 48, 64, 76, 92,104,104, 92, 76, 64, 48, 36, 20,  8,
 282   4, 16, 24, 36, 44, 56, 64, 76, 76, 64, 56, 44, 36, 24, 16,  4,
 283   4,  8, 16, 20, 28, 32, 40, 44, 44, 40, 32, 28, 20, 16,  8,  4,
 284   0,  4,  4,  8,  8, 12, 12, 16, 16, 12, 12,  8,  8,  4,  4,  0,
 285 //error:0.000015
 286 };
 287 #else //64*cos
 288 static const uint8_t obmc32[1024]={
 289  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 290  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 291  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 292  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 293  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 294  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 295  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 296  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 297  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 298  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 299  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 300  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 301  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 302  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 303  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 304  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 305  0, 1, 4, 7,12,17,23,29,35,41,47,52,57,60,63,64,64,63,60,57,52,47,41,35,29,23,17,12, 7, 4, 1, 0,
 306  0, 1, 4, 7,12,17,22,28,34,40,46,51,56,59,61,63,63,61,59,56,51,46,40,34,28,22,17,12, 7, 4, 1, 0,
 307  0, 1, 3, 7,11,16,21,27,33,39,44,49,53,57,59,60,60,59,57,53,49,44,39,33,27,21,16,11, 7, 3, 1, 0,
 308  0, 1, 3, 6,11,15,20,26,31,37,42,46,50,53,56,57,57,56,53,50,46,42,37,31,26,20,15,11, 6, 3, 1, 0,
 309  0, 1, 3, 6, 9,14,19,24,29,34,38,43,46,49,51,52,52,51,49,46,43,38,34,29,24,19,14, 9, 6, 3, 1, 0,
 310  0, 1, 3, 5, 9,12,17,21,26,30,35,38,42,44,46,47,47,46,44,42,38,35,30,26,21,17,12, 9, 5, 3, 1, 0,
 311  0, 1, 3, 5, 7,11,15,19,23,26,30,34,37,39,40,41,41,40,39,37,34,30,26,23,19,15,11, 7, 5, 3, 1, 0,
 312  0, 1, 2, 4, 6, 9,12,16,19,23,26,29,31,33,34,35,35,34,33,31,29,26,23,19,16,12, 9, 6, 4, 2, 1, 0,
 313  0, 1, 2, 3, 5, 8,10,13,16,19,21,24,26,27,28,29,29,28,27,26,24,21,19,16,13,10, 8, 5, 3, 2, 1, 0,
 314  0, 1, 1, 2, 4, 6, 8,10,12,15,17,19,20,21,22,23,23,22,21,20,19,17,15,12,10, 8, 6, 4, 2, 1, 1, 0,
 315  0, 0, 1, 2, 3, 5, 6, 8, 9,11,12,14,15,16,17,17,17,17,16,15,14,12,11, 9, 8, 6, 5, 3, 2, 1, 0, 0,
 316  0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 9,10,11,11,12,12,12,12,11,11,10, 9, 7, 6, 5, 4, 3, 2, 1, 1, 0, 0,
 317  0, 0, 1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 6, 5, 5, 4, 3, 2, 2, 1, 1, 1, 0, 0,
 318  0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0,
 319  0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
 320  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 321 //error:0.000022
 322 };
 323 static const uint8_t obmc16[256]={
 324  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 325  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 326  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 327  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 328  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 329  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 330  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 331  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 332  0, 5,14,26,38,49,58,63,63,58,49,38,26,14, 5, 0,
 333  1, 5,13,24,35,46,54,58,58,54,46,35,24,13, 5, 1,
 334  1, 4,11,20,30,39,46,49,49,46,39,30,20,11, 4, 1,
 335  0, 3, 8,16,23,30,35,38,38,35,30,23,16, 8, 3, 0,
 336  0, 2, 6,10,15,20,24,26,26,24,20,15,10, 6, 2, 0,
 337  0, 1, 3, 6, 8,11,13,14,14,13,11, 8, 6, 3, 1, 0,
 338  0, 0, 1, 2, 3, 4, 5, 5, 5, 5, 4, 3, 2, 1, 0, 0,
 339  0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
 340 //error:0.000022
 341 };
 342 #endif
 343
 344 //linear *64
 345 static const uint8_t obmc8[64]={
 346   4, 12, 20, 28, 28, 20, 12,  4,
 347  12, 36, 60, 84, 84, 60, 36, 12,
 348  20, 60,100,140,140,100, 60, 20,
 349  28, 84,140,196,196,140, 84, 28,
 350  28, 84,140,196,196,140, 84, 28,
 351  20, 60,100,140,140,100, 60, 20,
 352  12, 36, 60, 84, 84, 60, 36, 12,
 353   4, 12, 20, 28, 28, 20, 12,  4,
 354 //error:0.000000
 355 };
 356
 357 //linear *64
 358 static const uint8_t obmc4[16]={
 359  16, 48, 48, 16,
 360  48,144,144, 48,
 361  48,144,144, 48,
 362  16, 48, 48, 16,
 363 //error:0.000000
 364 };
 365
 366 static const uint8_t *obmc_tab[4]={
 367     obmc32, obmc16, obmc8, obmc4
 368 };
 369
 370 static int scale_mv_ref[MAX_REF_FRAMES][MAX_REF_FRAMES];
 371
 372 typedef struct BlockNode{
 373     int16_t mx;
 374     int16_t my;
 375     uint8_t ref;
 376     uint8_t color[3];
 377     uint8_t type;
 378 //#define TYPE_SPLIT    1
 379 #define BLOCK_INTRA   1
 380 #define BLOCK_OPT     2
 381 //#define TYPE_NOCOLOR  4
 382     uint8_t level; //FIXME merge into type?
 383 }BlockNode;
 384
 385 static const BlockNode null_block= { //FIXME add border maybe
 386     .color= {128,128,128},
 387     .mx= 0,
 388     .my= 0,
 389     .ref= 0,
 390     .type= 0,
 391     .level= 0,
 392 };
 393
 394 #define LOG2_MB_SIZE 4
 395 #define MB_SIZE (1<<LOG2_MB_SIZE)
 396 #define ENCODER_EXTRA_BITS 4
 397
 398 typedef struct x_and_coeff{
 399     int16_t x;
 400     uint16_t coeff;
 401 } x_and_coeff;
 402
 403 typedef struct SubBand{
 404     int level;
 405     int stride;
 406     int width;
 407     int height;
 408     int qlog;                                   ///< log(qscale)/log[2^(1/6)]
 409     DWTELEM *buf;
 410     IDWTELEM *ibuf;
 411     int buf_x_offset;
 412     int buf_y_offset;
 413     int stride_line; ///< Stride measured in lines, not pixels.
 414     x_and_coeff * x_coeff;
 415     struct SubBand *parent;
 416     uint8_t state[/*7*2*/ 7 + 512][32];
 417 }SubBand;
 418
 419 typedef struct Plane{
 420     int width;
 421     int height;
 422     SubBand band[MAX_DECOMPOSITIONS][4];
 423 }Plane;
 424
 425 typedef struct SnowContext{
 426 //    MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 427
 428     AVCodecContext *avctx;
 429     RangeCoder c;
 430     DSPContext dsp;
 431     AVFrame new_picture;
 432     AVFrame input_picture;              ///< new_picture with the internal linesizes
 433     AVFrame current_picture;
 434     AVFrame last_picture[MAX_REF_FRAMES];
 435     AVFrame mconly_picture;
 436 //     uint8_t q_context[16];
 437     uint8_t header_state[32];
 438     uint8_t block_state[128 + 32*128];
 439     int keyframe;
 440     int always_reset;
 441     int version;
 442     int spatial_decomposition_type;
 443     int last_spatial_decomposition_type;
 444     int temporal_decomposition_type;
 445     int spatial_decomposition_count;
 446     int temporal_decomposition_count;
 447     int max_ref_frames;
 448     int ref_frames;
 449     int16_t (*ref_mvs[MAX_REF_FRAMES])[2];
 450     uint32_t *ref_scores[MAX_REF_FRAMES];
 451     DWTELEM *spatial_dwt_buffer;
 452     IDWTELEM *spatial_idwt_buffer;
 453     int colorspace_type;
 454     int chroma_h_shift;
 455     int chroma_v_shift;
 456     int spatial_scalability;
 457     int qlog;
 458     int last_qlog;
 459     int lambda;
 460     int lambda2;
 461     int pass1_rc;
 462     int mv_scale;
 463     int last_mv_scale;
 464     int qbias;
 465     int last_qbias;
 466 #define QBIAS_SHIFT 3
 467     int b_width;
 468     int b_height;
 469     int block_max_depth;
 470     int last_block_max_depth;
 471     Plane plane[MAX_PLANES];
 472     BlockNode *block;
 473 #define ME_CACHE_SIZE 1024
 474     int me_cache[ME_CACHE_SIZE];
 475     int me_cache_generation;
 476     slice_buffer sb;
 477
 478     MpegEncContext m; // needed for motion estimation, should not be used for anything else, the idea is to make the motion estimation eventually independent of MpegEncContext, so this will be removed then (FIXME/XXX)
 479 }SnowContext;
 480
 481 typedef struct {
 482     IDWTELEM *b0;
 483     IDWTELEM *b1;
 484     IDWTELEM *b2;
 485     IDWTELEM *b3;
 486     int y;
 487 } dwt_compose_t;
 488
 489 #define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
 490 //#define slice_buffer_get_line(slice_buf, line_num) (slice_buffer_load_line((slice_buf), (line_num)))
 491
 492 static void iterative_me(SnowContext *s);
 493
 494 static void slice_buffer_init(slice_buffer * buf, int line_count, int max_allocated_lines, int line_width, IDWTELEM * base_buffer)
 495 {
 496     int i;
 497
 498     buf->base_buffer = base_buffer;
 499     buf->line_count = line_count;
 500     buf->line_width = line_width;
 501     buf->data_count = max_allocated_lines;
 502     buf->line = av_mallocz (sizeof(IDWTELEM *) * line_count);
 503     buf->data_stack = av_malloc (sizeof(IDWTELEM *) * max_allocated_lines);
 504
 505     for (i = 0; i < max_allocated_lines; i++)
 506     {
 507         buf->data_stack[i] = av_malloc (sizeof(IDWTELEM) * line_width);
 508     }
 509
 510     buf->data_stack_top = max_allocated_lines - 1;
 511 }
 512
 513 static IDWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
 514 {
 515     int offset;
 516     IDWTELEM * buffer;
 517
 518 //  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);
 519
 520     assert(buf->data_stack_top >= 0);
 521 //  assert(!buf->line[line]);
 522     if (buf->line[line])
 523         return buf->line[line];
 524
 525     offset = buf->line_width * line;
 526     buffer = buf->data_stack[buf->data_stack_top];
 527     buf->data_stack_top--;
 528     buf->line[line] = buffer;
 529
 530 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 531
 532     return buffer;
 533 }
 534
 535 static void slice_buffer_release(slice_buffer * buf, int line)
 536 {
 537     int offset;
 538     IDWTELEM * buffer;
 539
 540     assert(line >= 0 && line < buf->line_count);
 541     assert(buf->line[line]);
 542
 543     offset = buf->line_width * line;
 544     buffer = buf->line[line];
 545     buf->data_stack_top++;
 546     buf->data_stack[buf->data_stack_top] = buffer;
 547     buf->line[line] = NULL;
 548
 549 //  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_release: line: %d remaining: %d\n", line, buf->data_stack_top + 1);
 550 }
 551
 552 static void slice_buffer_flush(slice_buffer * buf)
 553 {
 554     int i;
 555     for (i = 0; i < buf->line_count; i++)
 556     {
 557         if (buf->line[i])
 558         {
 559 //      av_log(NULL, AV_LOG_DEBUG, "slice_buffer_flush: line: %d \n", i);
 560             slice_buffer_release(buf, i);
 561         }
 562     }
 563 }
 564
 565 static void slice_buffer_destroy(slice_buffer * buf)
 566 {
 567     int i;
 568     slice_buffer_flush(buf);
 569
 570     for (i = buf->data_count - 1; i >= 0; i--)
 571     {
 572         assert(buf->data_stack[i]);
 573         av_freep(&buf->data_stack[i]);
 574     }
 575     assert(buf->data_stack);
 576     av_freep(&buf->data_stack);
 577     assert(buf->line);
 578     av_freep(&buf->line);
 579 }
 580
 581 #ifdef __sgi
 582 // Avoid a name clash on SGI IRIX
 583 #undef qexp
 584 #endif
 585 #define QEXPSHIFT (7-FRAC_BITS+8) //FIXME try to change this to 0
 586 static uint8_t qexp[QROOT];
 587
 588 static inline int mirror(int v, int m){
 589     while((unsigned)v > (unsigned)m){
 590         v=-v;
 591         if(v<0) v+= 2*m;
 592     }
 593     return v;
 594 }
 595
 596 static inline void put_symbol(RangeCoder *c, uint8_t *state, int v, int is_signed){
 597     int i;
 598
 599     if(v){
 600         const int a= FFABS(v);
 601         const int e= av_log2(a);
 602 #if 1
 603         const int el= FFMIN(e, 10);
 604         put_rac(c, state+0, 0);
 605
 606         for(i=0; i<el; i++){
 607             put_rac(c, state+1+i, 1);  //1..10
 608         }
 609         for(; i<e; i++){
 610             put_rac(c, state+1+9, 1);  //1..10
 611         }
 612         put_rac(c, state+1+FFMIN(i,9), 0);
 613
 614         for(i=e-1; i>=el; i--){
 615             put_rac(c, state+22+9, (a>>i)&1); //22..31
 616         }
 617         for(; i>=0; i--){
 618             put_rac(c, state+22+i, (a>>i)&1); //22..31
 619         }
 620
 621         if(is_signed)
 622             put_rac(c, state+11 + el, v < 0); //11..21
 623 #else
 624
 625         put_rac(c, state+0, 0);
 626         if(e<=9){
 627             for(i=0; i<e; i++){
 628                 put_rac(c, state+1+i, 1);  //1..10
 629             }
 630             put_rac(c, state+1+i, 0);
 631
 632             for(i=e-1; i>=0; i--){
 633                 put_rac(c, state+22+i, (a>>i)&1); //22..31
 634             }
 635
 636             if(is_signed)
 637                 put_rac(c, state+11 + e, v < 0); //11..21
 638         }else{
 639             for(i=0; i<e; i++){
 640                 put_rac(c, state+1+FFMIN(i,9), 1);  //1..10
 641             }
 642             put_rac(c, state+1+FFMIN(i,9), 0);
 643
 644             for(i=e-1; i>=0; i--){
 645                 put_rac(c, state+22+FFMIN(i,9), (a>>i)&1); //22..31
 646             }
 647
 648             if(is_signed)
 649                 put_rac(c, state+11 + FFMIN(e,10), v < 0); //11..21
 650         }
 651 #endif
 652     }else{
 653         put_rac(c, state+0, 1);
 654     }
 655 }
 656
 657 static inline int get_symbol(RangeCoder *c, uint8_t *state, int is_signed){
 658     if(get_rac(c, state+0))
 659         return 0;
 660     else{
 661         int i, e, a;
 662         e= 0;
 663         while(get_rac(c, state+1 + FFMIN(e,9))){ //1..10
 664             e++;
 665         }
 666
 667         a= 1;
 668         for(i=e-1; i>=0; i--){
 669             a += a + get_rac(c, state+22 + FFMIN(i,9)); //22..31
 670         }
 671
 672         if(is_signed && get_rac(c, state+11 + FFMIN(e,10))) //11..21
 673             return -a;
 674         else
 675             return a;
 676     }
 677 }
 678
 679 static inline void put_symbol2(RangeCoder *c, uint8_t *state, int v, int log2){
 680     int i;
 681     int r= log2>=0 ? 1<<log2 : 1;
 682
 683     assert(v>=0);
 684     assert(log2>=-4);
 685
 686     while(v >= r){
 687         put_rac(c, state+4+log2, 1);
 688         v -= r;
 689         log2++;
 690         if(log2>0) r+=r;
 691     }
 692     put_rac(c, state+4+log2, 0);
 693
 694     for(i=log2-1; i>=0; i--){
 695         put_rac(c, state+31-i, (v>>i)&1);
 696     }
 697 }
 698
 699 static inline int get_symbol2(RangeCoder *c, uint8_t *state, int log2){
 700     int i;
 701     int r= log2>=0 ? 1<<log2 : 1;
 702     int v=0;
 703
 704     assert(log2>=-4);
 705
 706     while(get_rac(c, state+4+log2)){
 707         v+= r;
 708         log2++;
 709         if(log2>0) r+=r;
 710     }
 711
 712     for(i=log2-1; i>=0; i--){
 713         v+= get_rac(c, state+31-i)<<i;
 714     }
 715
 716     return v;
 717 }
 718
 719 static av_always_inline void lift(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 720     const int mirror_left= !highpass;
 721     const int mirror_right= (width&1) ^ highpass;
 722     const int w= (width>>1) - 1 + (highpass & width);
 723     int i;
 724
 725 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 726     if(mirror_left){
 727         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 728         dst += dst_step;
 729         src += src_step;
 730     }
 731
 732     for(i=0; i<w; i++){
 733         dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
 734     }
 735
 736     if(mirror_right){
 737         dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
 738     }
 739 }
 740
 741 static av_always_inline void inv_lift(IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 742     const int mirror_left= !highpass;
 743     const int mirror_right= (width&1) ^ highpass;
 744     const int w= (width>>1) - 1 + (highpass & width);
 745     int i;
 746
 747 #define LIFT(src, ref, inv) ((src) + ((inv) ? - (ref) : + (ref)))
 748     if(mirror_left){
 749         dst[0] = LIFT(src[0], ((mul*2*ref[0]+add)>>shift), inverse);
 750         dst += dst_step;
 751         src += src_step;
 752     }
 753
 754     for(i=0; i<w; i++){
 755         dst[i*dst_step] = LIFT(src[i*src_step], ((mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add)>>shift), inverse);
 756     }
 757
 758     if(mirror_right){
 759         dst[w*dst_step] = LIFT(src[w*src_step], ((mul*2*ref[w*ref_step]+add)>>shift), inverse);
 760     }
 761 }
 762
 763 #ifndef liftS
 764 static av_always_inline void liftS(DWTELEM *dst, DWTELEM *src, DWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 765     const int mirror_left= !highpass;
 766     const int mirror_right= (width&1) ^ highpass;
 767     const int w= (width>>1) - 1 + (highpass & width);
 768     int i;
 769
 770     assert(shift == 4);
 771 #define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
 772     if(mirror_left){
 773         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 774         dst += dst_step;
 775         src += src_step;
 776     }
 777
 778     for(i=0; i<w; i++){
 779         dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
 780     }
 781
 782     if(mirror_right){
 783         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 784     }
 785 }
 786 static av_always_inline void inv_liftS(IDWTELEM *dst, IDWTELEM *src, IDWTELEM *ref, int dst_step, int src_step, int ref_step, int width, int mul, int add, int shift, int highpass, int inverse){
 787     const int mirror_left= !highpass;
 788     const int mirror_right= (width&1) ^ highpass;
 789     const int w= (width>>1) - 1 + (highpass & width);
 790     int i;
 791
 792     assert(shift == 4);
 793 #define LIFTS(src, ref, inv) ((inv) ? (src) + (((ref) + 4*(src))>>shift): -((-16*(src) + (ref) + add/4 + 1 + (5<<25))/(5*4) - (1<<23)))
 794     if(mirror_left){
 795         dst[0] = LIFTS(src[0], mul*2*ref[0]+add, inverse);
 796         dst += dst_step;
 797         src += src_step;
 798     }
 799
 800     for(i=0; i<w; i++){
 801         dst[i*dst_step] = LIFTS(src[i*src_step], mul*(ref[i*ref_step] + ref[(i+1)*ref_step])+add, inverse);
 802     }
 803
 804     if(mirror_right){
 805         dst[w*dst_step] = LIFTS(src[w*src_step], mul*2*ref[w*ref_step]+add, inverse);
 806     }
 807 }
 808 #endif
 809
 810 static void horizontal_decompose53i(DWTELEM *b, int width){
 811     DWTELEM temp[width];
 812     const int width2= width>>1;
 813     int x;
 814     const int w2= (width+1)>>1;
 815
 816     for(x=0; x<width2; x++){
 817         temp[x   ]= b[2*x    ];
 818         temp[x+w2]= b[2*x + 1];
 819     }
 820     if(width&1)
 821         temp[x   ]= b[2*x    ];
 822 #if 0
 823     {
 824     int A1,A2,A3,A4;
 825     A2= temp[1       ];
 826     A4= temp[0       ];
 827     A1= temp[0+width2];
 828     A1 -= (A2 + A4)>>1;
 829     A4 += (A1 + 1)>>1;
 830     b[0+width2] = A1;
 831     b[0       ] = A4;
 832     for(x=1; x+1<width2; x+=2){
 833         A3= temp[x+width2];
 834         A4= temp[x+1     ];
 835         A3 -= (A2 + A4)>>1;
 836         A2 += (A1 + A3 + 2)>>2;
 837         b[x+width2] = A3;
 838         b[x       ] = A2;
 839
 840         A1= temp[x+1+width2];
 841         A2= temp[x+2       ];
 842         A1 -= (A2 + A4)>>1;
 843         A4 += (A1 + A3 + 2)>>2;
 844         b[x+1+width2] = A1;
 845         b[x+1       ] = A4;
 846     }
 847     A3= temp[width-1];
 848     A3 -= A2;
 849     A2 += (A1 + A3 + 2)>>2;
 850     b[width -1] = A3;
 851     b[width2-1] = A2;
 852     }
 853 #else
 854     lift(b+w2, temp+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 0);
 855     lift(b   , temp   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 0);
 856 #endif
 857 }
 858
 859 static void vertical_decompose53iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 860     int i;
 861
 862     for(i=0; i<width; i++){
 863         b1[i] -= (b0[i] + b2[i])>>1;
 864     }
 865 }
 866
 867 static void vertical_decompose53iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 868     int i;
 869
 870     for(i=0; i<width; i++){
 871         b1[i] += (b0[i] + b2[i] + 2)>>2;
 872     }
 873 }
 874
 875 static void spatial_decompose53i(DWTELEM *buffer, int width, int height, int stride){
 876     int y;
 877     DWTELEM *b0= buffer + mirror(-2-1, height-1)*stride;
 878     DWTELEM *b1= buffer + mirror(-2  , height-1)*stride;
 879
 880     for(y=-2; y<height; y+=2){
 881         DWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
 882         DWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
 883
 884 {START_TIMER
 885         if(y+1<(unsigned)height) horizontal_decompose53i(b2, width);
 886         if(y+2<(unsigned)height) horizontal_decompose53i(b3, width);
 887 STOP_TIMER("horizontal_decompose53i")}
 888
 889 {START_TIMER
 890         if(y+1<(unsigned)height) vertical_decompose53iH0(b1, b2, b3, width);
 891         if(y+0<(unsigned)height) vertical_decompose53iL0(b0, b1, b2, width);
 892 STOP_TIMER("vertical_decompose53i*")}
 893
 894         b0=b2;
 895         b1=b3;
 896     }
 897 }
 898
 899 static void horizontal_decompose97i(DWTELEM *b, int width){
 900     DWTELEM temp[width];
 901     const int w2= (width+1)>>1;
 902
 903     lift (temp+w2, b    +1, b      , 1, 2, 2, width,  W_AM, W_AO, W_AS, 1, 1);
 904     liftS(temp   , b      , temp+w2, 1, 2, 1, width,  W_BM, W_BO, W_BS, 0, 0);
 905     lift (b   +w2, temp+w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 0);
 906     lift (b      , temp   , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 0);
 907 }
 908
 909
 910 static void vertical_decompose97iH0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 911     int i;
 912
 913     for(i=0; i<width; i++){
 914         b1[i] -= (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
 915     }
 916 }
 917
 918 static void vertical_decompose97iH1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 919     int i;
 920
 921     for(i=0; i<width; i++){
 922         b1[i] += (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
 923     }
 924 }
 925
 926 static void vertical_decompose97iL0(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 927     int i;
 928
 929     for(i=0; i<width; i++){
 930 #ifdef liftS
 931         b1[i] -= (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
 932 #else
 933         b1[i] = (16*4*b1[i] - 4*(b0[i] + b2[i]) + W_BO*5 + (5<<27)) / (5*16) - (1<<23);
 934 #endif
 935     }
 936 }
 937
 938 static void vertical_decompose97iL1(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, int width){
 939     int i;
 940
 941     for(i=0; i<width; i++){
 942         b1[i] += (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
 943     }
 944 }
 945
 946 static void spatial_decompose97i(DWTELEM *buffer, int width, int height, int stride){
 947     int y;
 948     DWTELEM *b0= buffer + mirror(-4-1, height-1)*stride;
 949     DWTELEM *b1= buffer + mirror(-4  , height-1)*stride;
 950     DWTELEM *b2= buffer + mirror(-4+1, height-1)*stride;
 951     DWTELEM *b3= buffer + mirror(-4+2, height-1)*stride;
 952
 953     for(y=-4; y<height; y+=2){
 954         DWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
 955         DWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
 956
 957 {START_TIMER
 958         if(y+3<(unsigned)height) horizontal_decompose97i(b4, width);
 959         if(y+4<(unsigned)height) horizontal_decompose97i(b5, width);
 960 if(width>400){
 961 STOP_TIMER("horizontal_decompose97i")
 962 }}
 963
 964 {START_TIMER
 965         if(y+3<(unsigned)height) vertical_decompose97iH0(b3, b4, b5, width);
 966         if(y+2<(unsigned)height) vertical_decompose97iL0(b2, b3, b4, width);
 967         if(y+1<(unsigned)height) vertical_decompose97iH1(b1, b2, b3, width);
 968         if(y+0<(unsigned)height) vertical_decompose97iL1(b0, b1, b2, width);
 969
 970 if(width>400){
 971 STOP_TIMER("vertical_decompose97i")
 972 }}
 973
 974         b0=b2;
 975         b1=b3;
 976         b2=b4;
 977         b3=b5;
 978     }
 979 }
 980
 981 void ff_spatial_dwt(DWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
 982     int level;
 983
 984     for(level=0; level<decomposition_count; level++){
 985         switch(type){
 986         case DWT_97: spatial_decompose97i(buffer, width>>level, height>>level, stride<<level); break;
 987         case DWT_53: spatial_decompose53i(buffer, width>>level, height>>level, stride<<level); break;
 988         }
 989     }
 990 }
 991
 992 static void horizontal_compose53i(IDWTELEM *b, int width){
 993     IDWTELEM temp[width];
 994     const int width2= width>>1;
 995     const int w2= (width+1)>>1;
 996     int x;
 997
 998 #if 0
 999     int A1,A2,A3,A4;
1000     A2= temp[1       ];
1001     A4= temp[0       ];
1002     A1= temp[0+width2];
1003     A1 -= (A2 + A4)>>1;
1004     A4 += (A1 + 1)>>1;
1005     b[0+width2] = A1;
1006     b[0       ] = A4;
1007     for(x=1; x+1<width2; x+=2){
1008         A3= temp[x+width2];
1009         A4= temp[x+1     ];
1010         A3 -= (A2 + A4)>>1;
1011         A2 += (A1 + A3 + 2)>>2;
1012         b[x+width2] = A3;
1013         b[x       ] = A2;
1014
1015         A1= temp[x+1+width2];
1016         A2= temp[x+2       ];
1017         A1 -= (A2 + A4)>>1;
1018         A4 += (A1 + A3 + 2)>>2;
1019         b[x+1+width2] = A1;
1020         b[x+1       ] = A4;
1021     }
1022     A3= temp[width-1];
1023     A3 -= A2;
1024     A2 += (A1 + A3 + 2)>>2;
1025     b[width -1] = A3;
1026     b[width2-1] = A2;
1027 #else
1028     inv_lift(temp   , b   , b+w2, 1, 1, 1, width,  1, 2, 2, 0, 1);
1029     inv_lift(temp+w2, b+w2, temp, 1, 1, 1, width, -1, 0, 1, 1, 1);
1030 #endif
1031     for(x=0; x<width2; x++){
1032         b[2*x    ]= temp[x   ];
1033         b[2*x + 1]= temp[x+w2];
1034     }
1035     if(width&1)
1036         b[2*x    ]= temp[x   ];
1037 }
1038
1039 static void vertical_compose53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1040     int i;
1041
1042     for(i=0; i<width; i++){
1043         b1[i] += (b0[i] + b2[i])>>1;
1044     }
1045 }
1046
1047 static void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1048     int i;
1049
1050     for(i=0; i<width; i++){
1051         b1[i] -= (b0[i] + b2[i] + 2)>>2;
1052     }
1053 }
1054
1055 static void spatial_compose53i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1056     cs->b0 = slice_buffer_get_line(sb, mirror(-1-1, height-1) * stride_line);
1057     cs->b1 = slice_buffer_get_line(sb, mirror(-1  , height-1) * stride_line);
1058     cs->y = -1;
1059 }
1060
1061 static void spatial_compose53i_init(dwt_compose_t *cs, IDWTELEM *buffer, int height, int stride){
1062     cs->b0 = buffer + mirror(-1-1, height-1)*stride;
1063     cs->b1 = buffer + mirror(-1  , height-1)*stride;
1064     cs->y = -1;
1065 }
1066
1067 static void spatial_compose53i_dy_buffered(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1068     int y= cs->y;
1069
1070     IDWTELEM *b0= cs->b0;
1071     IDWTELEM *b1= cs->b1;
1072     IDWTELEM *b2= slice_buffer_get_line(sb, mirror(y+1, height-1) * stride_line);
1073     IDWTELEM *b3= slice_buffer_get_line(sb, mirror(y+2, height-1) * stride_line);
1074
1075 {START_TIMER
1076         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1077         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1078 STOP_TIMER("vertical_compose53i*")}
1079
1080 {START_TIMER
1081         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1082         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1083 STOP_TIMER("horizontal_compose53i")}
1084
1085     cs->b0 = b2;
1086     cs->b1 = b3;
1087     cs->y += 2;
1088 }
1089
1090 static void spatial_compose53i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride){
1091     int y= cs->y;
1092     IDWTELEM *b0= cs->b0;
1093     IDWTELEM *b1= cs->b1;
1094     IDWTELEM *b2= buffer + mirror(y+1, height-1)*stride;
1095     IDWTELEM *b3= buffer + mirror(y+2, height-1)*stride;
1096
1097 {START_TIMER
1098         if(y+1<(unsigned)height) vertical_compose53iL0(b1, b2, b3, width);
1099         if(y+0<(unsigned)height) vertical_compose53iH0(b0, b1, b2, width);
1100 STOP_TIMER("vertical_compose53i*")}
1101
1102 {START_TIMER
1103         if(y-1<(unsigned)height) horizontal_compose53i(b0, width);
1104         if(y+0<(unsigned)height) horizontal_compose53i(b1, width);
1105 STOP_TIMER("horizontal_compose53i")}
1106
1107     cs->b0 = b2;
1108     cs->b1 = b3;
1109     cs->y += 2;
1110 }
1111
1112 static void spatial_compose53i(IDWTELEM *buffer, int width, int height, int stride){
1113     dwt_compose_t cs;
1114     spatial_compose53i_init(&cs, buffer, height, stride);
1115     while(cs.y <= height)
1116         spatial_compose53i_dy(&cs, buffer, width, height, stride);
1117 }
1118
1119
1120 void ff_snow_horizontal_compose97i(IDWTELEM *b, int width){
1121     IDWTELEM temp[width];
1122     const int w2= (width+1)>>1;
1123
1124     inv_lift (temp   , b      , b   +w2, 1, 1, 1, width,  W_DM, W_DO, W_DS, 0, 1);
1125     inv_lift (temp+w2, b   +w2, temp   , 1, 1, 1, width,  W_CM, W_CO, W_CS, 1, 1);
1126     inv_liftS(b      , temp   , temp+w2, 2, 1, 1, width,  W_BM, W_BO, W_BS, 0, 1);
1127     inv_lift (b+1    , temp+w2, b      , 2, 1, 2, width,  W_AM, W_AO, W_AS, 1, 0);
1128 }
1129
1130 static void vertical_compose97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1131     int i;
1132
1133     for(i=0; i<width; i++){
1134         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1135     }
1136 }
1137
1138 static void vertical_compose97iH1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1139     int i;
1140
1141     for(i=0; i<width; i++){
1142         b1[i] -= (W_CM*(b0[i] + b2[i])+W_CO)>>W_CS;
1143     }
1144 }
1145
1146 static void vertical_compose97iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1147     int i;
1148
1149     for(i=0; i<width; i++){
1150 #ifdef liftS
1151         b1[i] += (W_BM*(b0[i] + b2[i])+W_BO)>>W_BS;
1152 #else
1153         b1[i] += (W_BM*(b0[i] + b2[i])+4*b1[i]+W_BO)>>W_BS;
1154 #endif
1155     }
1156 }
1157
1158 static void vertical_compose97iL1(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width){
1159     int i;
1160
1161     for(i=0; i<width; i++){
1162         b1[i] -= (W_DM*(b0[i] + b2[i])+W_DO)>>W_DS;
1163     }
1164 }
1165
1166 void ff_snow_vertical_compose97i(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
1167     int i;
1168
1169     for(i=0; i<width; i++){
1170         b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
1171         b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
1172 #ifdef liftS
1173         b2[i] += (W_BM*(b1[i] + b3[i])+W_BO)>>W_BS;
1174 #else
1175         b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
1176 #endif
1177         b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
1178     }
1179 }
1180
1181 static void spatial_compose97i_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int height, int stride_line){
1182     cs->b0 = slice_buffer_get_line(sb, mirror(-3-1, height-1) * stride_line);
1183     cs->b1 = slice_buffer_get_line(sb, mirror(-3  , height-1) * stride_line);
1184     cs->b2 = slice_buffer_get_line(sb, mirror(-3+1, height-1) * stride_line);
1185     cs->b3 = slice_buffer_get_line(sb, mirror(-3+2, height-1) * stride_line);
1186     cs->y = -3;
1187 }
1188
1189 static void spatial_compose97i_init(dwt_compose_t *cs, IDWTELEM *buffer, int height, int stride){
1190     cs->b0 = buffer + mirror(-3-1, height-1)*stride;
1191     cs->b1 = buffer + mirror(-3  , height-1)*stride;
1192     cs->b2 = buffer + mirror(-3+1, height-1)*stride;
1193     cs->b3 = buffer + mirror(-3+2, height-1)*stride;
1194     cs->y = -3;
1195 }
1196
1197 static void spatial_compose97i_dy_buffered(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line){
1198     int y = cs->y;
1199
1200     IDWTELEM *b0= cs->b0;
1201     IDWTELEM *b1= cs->b1;
1202     IDWTELEM *b2= cs->b2;
1203     IDWTELEM *b3= cs->b3;
1204     IDWTELEM *b4= slice_buffer_get_line(sb, mirror(y + 3, height - 1) * stride_line);
1205     IDWTELEM *b5= slice_buffer_get_line(sb, mirror(y + 4, height - 1) * stride_line);
1206
1207 {START_TIMER
1208     if(y>0 && y+4<height){
1209         dsp->vertical_compose97i(b0, b1, b2, b3, b4, b5, width);
1210     }else{
1211         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1212         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1213         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1214         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1215     }
1216 if(width>400){
1217 STOP_TIMER("vertical_compose97i")}}
1218
1219 {START_TIMER
1220         if(y-1<(unsigned)height) dsp->horizontal_compose97i(b0, width);
1221         if(y+0<(unsigned)height) dsp->horizontal_compose97i(b1, width);
1222 if(width>400 && y+0<(unsigned)height){
1223 STOP_TIMER("horizontal_compose97i")}}
1224
1225     cs->b0=b2;
1226     cs->b1=b3;
1227     cs->b2=b4;
1228     cs->b3=b5;
1229     cs->y += 2;
1230 }
1231
1232 static void spatial_compose97i_dy(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride){
1233     int y = cs->y;
1234     IDWTELEM *b0= cs->b0;
1235     IDWTELEM *b1= cs->b1;
1236     IDWTELEM *b2= cs->b2;
1237     IDWTELEM *b3= cs->b3;
1238     IDWTELEM *b4= buffer + mirror(y+3, height-1)*stride;
1239     IDWTELEM *b5= buffer + mirror(y+4, height-1)*stride;
1240
1241 {START_TIMER
1242         if(y+3<(unsigned)height) vertical_compose97iL1(b3, b4, b5, width);
1243         if(y+2<(unsigned)height) vertical_compose97iH1(b2, b3, b4, width);
1244         if(y+1<(unsigned)height) vertical_compose97iL0(b1, b2, b3, width);
1245         if(y+0<(unsigned)height) vertical_compose97iH0(b0, b1, b2, width);
1246 if(width>400){
1247 STOP_TIMER("vertical_compose97i")}}
1248
1249 {START_TIMER
1250         if(y-1<(unsigned)height) ff_snow_horizontal_compose97i(b0, width);
1251         if(y+0<(unsigned)height) ff_snow_horizontal_compose97i(b1, width);
1252 if(width>400 && b0 <= b2){
1253 STOP_TIMER("horizontal_compose97i")}}
1254
1255     cs->b0=b2;
1256     cs->b1=b3;
1257     cs->b2=b4;
1258     cs->b3=b5;
1259     cs->y += 2;
1260 }
1261
1262 static void spatial_compose97i(IDWTELEM *buffer, int width, int height, int stride){
1263     dwt_compose_t cs;
1264     spatial_compose97i_init(&cs, buffer, height, stride);
1265     while(cs.y <= height)
1266         spatial_compose97i_dy(&cs, buffer, width, height, stride);
1267 }
1268
1269 static void ff_spatial_idwt_buffered_init(dwt_compose_t *cs, slice_buffer * sb, int width, int height, int stride_line, int type, int decomposition_count){
1270     int level;
1271     for(level=decomposition_count-1; level>=0; level--){
1272         switch(type){
1273         case DWT_97: spatial_compose97i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1274         case DWT_53: spatial_compose53i_buffered_init(cs+level, sb, height>>level, stride_line<<level); break;
1275         }
1276     }
1277 }
1278
1279 static void ff_spatial_idwt_init(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1280     int level;
1281     for(level=decomposition_count-1; level>=0; level--){
1282         switch(type){
1283         case DWT_97: spatial_compose97i_init(cs+level, buffer, height>>level, stride<<level); break;
1284         case DWT_53: spatial_compose53i_init(cs+level, buffer, height>>level, stride<<level); break;
1285         }
1286     }
1287 }
1288
1289 static void ff_spatial_idwt_slice(dwt_compose_t *cs, IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count, int y){
1290     const int support = type==1 ? 3 : 5;
1291     int level;
1292     if(type==2) return;
1293
1294     for(level=decomposition_count-1; level>=0; level--){
1295         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1296             switch(type){
1297             case DWT_97: spatial_compose97i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1298                     break;
1299             case DWT_53: spatial_compose53i_dy(cs+level, buffer, width>>level, height>>level, stride<<level);
1300                     break;
1301             }
1302         }
1303     }
1304 }
1305
1306 static void ff_spatial_idwt_buffered_slice(DSPContext *dsp, dwt_compose_t *cs, slice_buffer * slice_buf, int width, int height, int stride_line, int type, int decomposition_count, int y){
1307     const int support = type==1 ? 3 : 5;
1308     int level;
1309     if(type==2) return;
1310
1311     for(level=decomposition_count-1; level>=0; level--){
1312         while(cs[level].y <= FFMIN((y>>level)+support, height>>level)){
1313             switch(type){
1314             case DWT_97: spatial_compose97i_dy_buffered(dsp, cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1315                     break;
1316             case DWT_53: spatial_compose53i_dy_buffered(cs+level, slice_buf, width>>level, height>>level, stride_line<<level);
1317                     break;
1318             }
1319         }
1320     }
1321 }
1322
1323 static void ff_spatial_idwt(IDWTELEM *buffer, int width, int height, int stride, int type, int decomposition_count){
1324         dwt_compose_t cs[MAX_DECOMPOSITIONS];
1325         int y;
1326         ff_spatial_idwt_init(cs, buffer, width, height, stride, type, decomposition_count);
1327         for(y=0; y<height; y+=4)
1328             ff_spatial_idwt_slice(cs, buffer, width, height, stride, type, decomposition_count, y);
1329 }
1330
1331 static int encode_subband_c0run(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
1332     const int w= b->width;
1333     const int h= b->height;
1334     int x, y;
1335
1336     if(1){
1337         int run=0;
1338         int runs[w*h];
1339         int run_index=0;
1340         int max_index;
1341
1342         for(y=0; y<h; y++){
1343             for(x=0; x<w; x++){
1344                 int v, p=0;
1345                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1346                 v= src[x + y*stride];
1347
1348                 if(y){
1349                     t= src[x + (y-1)*stride];
1350                     if(x){
1351                         lt= src[x - 1 + (y-1)*stride];
1352                     }
1353                     if(x + 1 < w){
1354                         rt= src[x + 1 + (y-1)*stride];
1355                     }
1356                 }
1357                 if(x){
1358                     l= src[x - 1 + y*stride];
1359                     /*if(x > 1){
1360                         if(orientation==1) ll= src[y + (x-2)*stride];
1361                         else               ll= src[x - 2 + y*stride];
1362                     }*/
1363                 }
1364                 if(parent){
1365                     int px= x>>1;
1366                     int py= y>>1;
1367                     if(px<b->parent->width && py<b->parent->height)
1368                         p= parent[px + py*2*stride];
1369                 }
1370                 if(!(/*ll|*/l|lt|t|rt|p)){
1371                     if(v){
1372                         runs[run_index++]= run;
1373                         run=0;
1374                     }else{
1375                         run++;
1376                     }
1377                 }
1378             }
1379         }
1380         max_index= run_index;
1381         runs[run_index++]= run;
1382         run_index=0;
1383         run= runs[run_index++];
1384
1385         put_symbol2(&s->c, b->state[30], max_index, 0);
1386         if(run_index <= max_index)
1387             put_symbol2(&s->c, b->state[1], run, 3);
1388
1389         for(y=0; y<h; y++){
1390             if(s->c.bytestream_end - s->c.bytestream < w*40){
1391                 av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
1392                 return -1;
1393             }
1394             for(x=0; x<w; x++){
1395                 int v, p=0;
1396                 int /*ll=0, */l=0, lt=0, t=0, rt=0;
1397                 v= src[x + y*stride];
1398
1399                 if(y){
1400                     t= src[x + (y-1)*stride];
1401                     if(x){
1402                         lt= src[x - 1 + (y-1)*stride];
1403                     }
1404                     if(x + 1 < w){
1405                         rt= src[x + 1 + (y-1)*stride];
1406                     }
1407                 }
1408                 if(x){
1409                     l= src[x - 1 + y*stride];
1410                     /*if(x > 1){
1411                         if(orientation==1) ll= src[y + (x-2)*stride];
1412                         else               ll= src[x - 2 + y*stride];
1413                     }*/
1414                 }
1415                 if(parent){
1416                     int px= x>>1;
1417                     int py= y>>1;
1418                     if(px<b->parent->width && py<b->parent->height)
1419                         p= parent[px + py*2*stride];
1420                 }
1421                 if(/*ll|*/l|lt|t|rt|p){
1422                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1423
1424                     put_rac(&s->c, &b->state[0][context], !!v);
1425                 }else{
1426                     if(!run){
1427                         run= runs[run_index++];
1428
1429                         if(run_index <= max_index)
1430                             put_symbol2(&s->c, b->state[1], run, 3);
1431                         assert(v);
1432                     }else{
1433                         run--;
1434                         assert(!v);
1435                     }
1436                 }
1437                 if(v){
1438                     int context= av_log2(/*FFABS(ll) + */3*FFABS(l) + FFABS(lt) + 2*FFABS(t) + FFABS(rt) + FFABS(p));
1439                     int l2= 2*FFABS(l) + (l<0);
1440                     int t2= 2*FFABS(t) + (t<0);
1441
1442                     put_symbol2(&s->c, b->state[context + 2], FFABS(v)-1, context-4);
1443                     put_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l2&0xFF] + 3*quant3bA[t2&0xFF]], v<0);
1444                 }
1445             }
1446         }
1447     }
1448     return 0;
1449 }
1450
1451 static int encode_subband(SnowContext *s, SubBand *b, IDWTELEM *src, IDWTELEM *parent, int stride, int orientation){
1452 //    encode_subband_qtree(s, b, src, parent, stride, orientation);
1453 //    encode_subband_z0run(s, b, src, parent, stride, orientation);
1454     return encode_subband_c0run(s, b, src, parent, stride, orientation);
1455 //    encode_subband_dzr(s, b, src, parent, stride, orientation);
1456 }
1457
1458 static inline void unpack_coeffs(SnowContext *s, SubBand *b, SubBand * parent, int orientation){
1459     const int w= b->width;
1460     const int h= b->height;
1461     int x,y;
1462
1463     if(1){
1464         int run, runs;
1465         x_and_coeff *xc= b->x_coeff;
1466         x_and_coeff *prev_xc= NULL;
1467         x_and_coeff *prev2_xc= xc;
1468         x_and_coeff *parent_xc= parent ? parent->x_coeff : NULL;
1469         x_and_coeff *prev_parent_xc= parent_xc;
1470
1471         runs= get_symbol2(&s->c, b->state[30], 0);
1472         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1473         else           run= INT_MAX;
1474
1475         for(y=0; y<h; y++){
1476             int v=0;
1477             int lt=0, t=0, rt=0;
1478
1479             if(y && prev_xc->x == 0){
1480                 rt= prev_xc->coeff;
1481             }
1482             for(x=0; x<w; x++){
1483                 int p=0;
1484                 const int l= v;
1485
1486                 lt= t; t= rt;
1487
1488                 if(y){
1489                     if(prev_xc->x <= x)
1490                         prev_xc++;
1491                     if(prev_xc->x == x + 1)
1492                         rt= prev_xc->coeff;
1493                     else
1494                         rt=0;
1495                 }
1496                 if(parent_xc){
1497                     if(x>>1 > parent_xc->x){
1498                         parent_xc++;
1499                     }
1500                     if(x>>1 == parent_xc->x){
1501                         p= parent_xc->coeff;
1502                     }
1503                 }
1504                 if(/*ll|*/l|lt|t|rt|p){
1505                     int context= av_log2(/*FFABS(ll) + */3*(l>>1) + (lt>>1) + (t&~1) + (rt>>1) + (p>>1));
1506
1507                     v=get_rac(&s->c, &b->state[0][context]);
1508                     if(v){
1509                         v= 2*(get_symbol2(&s->c, b->state[context + 2], context-4) + 1);
1510                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3 + quant3bA[l&0xFF] + 3*quant3bA[t&0xFF]]);
1511
1512                         xc->x=x;
1513                         (xc++)->coeff= v;
1514                     }
1515                 }else{
1516                     if(!run){
1517                         if(runs-- > 0) run= get_symbol2(&s->c, b->state[1], 3);
1518                         else           run= INT_MAX;
1519                         v= 2*(get_symbol2(&s->c, b->state[0 + 2], 0-4) + 1);
1520                         v+=get_rac(&s->c, &b->state[0][16 + 1 + 3]);
1521
1522                         xc->x=x;
1523                         (xc++)->coeff= v;
1524                     }else{
1525                         int max_run;
1526                         run--;
1527                         v=0;
1528
1529                         if(y) max_run= FFMIN(run, prev_xc->x - x - 2);
1530                         else  max_run= FFMIN(run, w-x-1);
1531                         if(parent_xc)
1532                             max_run= FFMIN(max_run, 2*parent_xc->x - x - 1);
1533                         x+= max_run;
1534                         run-= max_run;
1535                     }
1536                 }
1537             }
1538             (xc++)->x= w+1; //end marker
1539             prev_xc= prev2_xc;
1540             prev2_xc= xc;
1541
1542             if(parent_xc){
1543                 if(y&1){
1544                     while(parent_xc->x != parent->width+1)
1545                         parent_xc++;
1546                     parent_xc++;
1547                     prev_parent_xc= parent_xc;
1548                 }else{
1549                     parent_xc= prev_parent_xc;
1550                 }
1551             }
1552         }
1553
1554         (xc++)->x= w+1; //end marker
1555     }
1556 }
1557
1558 static inline void decode_subband_slice_buffered(SnowContext *s, SubBand *b, slice_buffer * sb, int start_y, int h, int save_state[1]){
1559     const int w= b->width;
1560     int y;
1561     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
1562     int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
1563     int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
1564     int new_index = 0;
1565
1566     START_TIMER
1567
1568     if(b->ibuf == s->spatial_idwt_buffer || s->qlog == LOSSLESS_QLOG){
1569         qadd= 0;
1570         qmul= 1<<QEXPSHIFT;
1571     }
1572
1573     /* If we are on the second or later slice, restore our index. */
1574     if (start_y != 0)
1575         new_index = save_state[0];
1576
1577
1578     for(y=start_y; y<h; y++){
1579         int x = 0;
1580         int v;
1581         IDWTELEM * line = slice_buffer_get_line(sb, y * b->stride_line + b->buf_y_offset) + b->buf_x_offset;
1582         memset(line, 0, b->width*sizeof(IDWTELEM));
1583         v = b->x_coeff[new_index].coeff;
1584         x = b->x_coeff[new_index++].x;
1585         while(x < w)
1586         {
1587             register int t= ( (v>>1)*qmul + qadd)>>QEXPSHIFT;
1588             register int u= -(v&1);
1589             line[x] = (t^u) - u;
1590
1591             v = b->x_coeff[new_index].coeff;
1592             x = b->x_coeff[new_index++].x;
1593         }
1594     }
1595     if(w > 200 && start_y != 0/*level+1 == s->spatial_decomposition_count*/){
1596         STOP_TIMER("decode_subband")
1597     }
1598
1599     /* Save our variables for the next slice. */
1600     save_state[0] = new_index;
1601
1602     return;
1603 }
1604
1605 static void reset_contexts(SnowContext *s){ //FIXME better initial contexts
1606     int plane_index, level, orientation;
1607
1608     for(plane_index=0; plane_index<3; plane_index++){
1609         for(level=0; level<s->spatial_decomposition_count; level++){
1610             for(orientation=level ? 1:0; orientation<4; orientation++){
1611                 memset(s->plane[plane_index].band[level][orientation].state, MID_STATE, sizeof(s->plane[plane_index].band[level][orientation].state));
1612             }
1613         }
1614     }
1615     memset(s->header_state, MID_STATE, sizeof(s->header_state));
1616     memset(s->block_state, MID_STATE, sizeof(s->block_state));
1617 }
1618
1619 static int alloc_blocks(SnowContext *s){
1620     int w= -((-s->avctx->width )>>LOG2_MB_SIZE);
1621     int h= -((-s->avctx->height)>>LOG2_MB_SIZE);
1622
1623     s->b_width = w;
1624     s->b_height= h;
1625
1626     s->block= av_mallocz(w * h * sizeof(BlockNode) << (s->block_max_depth*2));
1627     return 0;
1628 }
1629
1630 static inline void copy_rac_state(RangeCoder *d, RangeCoder *s){
1631     uint8_t *bytestream= d->bytestream;
1632     uint8_t *bytestream_start= d->bytestream_start;
1633     *d= *s;
1634     d->bytestream= bytestream;
1635     d->bytestream_start= bytestream_start;
1636 }
1637
1638 //near copy & paste from dsputil, FIXME
1639 static int pix_sum(uint8_t * pix, int line_size, int w)
1640 {
1641     int s, i, j;
1642
1643     s = 0;
1644     for (i = 0; i < w; i++) {
1645         for (j = 0; j < w; j++) {
1646             s += pix[0];
1647             pix ++;
1648         }
1649         pix += line_size - w;
1650     }
1651     return s;
1652 }
1653
1654 //near copy & paste from dsputil, FIXME
1655 static int pix_norm1(uint8_t * pix, int line_size, int w)
1656 {
1657     int s, i, j;
1658     uint32_t *sq = ff_squareTbl + 256;
1659
1660     s = 0;
1661     for (i = 0; i < w; i++) {
1662         for (j = 0; j < w; j ++) {
1663             s += sq[pix[0]];
1664             pix ++;
1665         }
1666         pix += line_size - w;
1667     }
1668     return s;
1669 }
1670
1671 static inline void set_blocks(SnowContext *s, int level, int x, int y, int l, int cb, int cr, int mx, int my, int ref, int type){
1672     const int w= s->b_width << s->block_max_depth;
1673     const int rem_depth= s->block_max_depth - level;
1674     const int index= (x + y*w) << rem_depth;
1675     const int block_w= 1<<rem_depth;
1676     BlockNode block;
1677     int i,j;
1678
1679     block.color[0]= l;
1680     block.color[1]= cb;
1681     block.color[2]= cr;
1682     block.mx= mx;
1683     block.my= my;
1684     block.ref= ref;
1685     block.type= type;
1686     block.level= level;
1687
1688     for(j=0; j<block_w; j++){
1689         for(i=0; i<block_w; i++){
1690             s->block[index + i + j*w]= block;
1691         }
1692     }
1693 }
1694
1695 static inline void init_ref(MotionEstContext *c, uint8_t *src[3], uint8_t *ref[3], uint8_t *ref2[3], int x, int y, int ref_index){
1696     const int offset[3]= {
1697           y*c->  stride + x,
1698         ((y*c->uvstride + x)>>1),
1699         ((y*c->uvstride + x)>>1),
1700     };
1701     int i;
1702     for(i=0; i<3; i++){
1703         c->src[0][i]= src [i];
1704         c->ref[0][i]= ref [i] + offset[i];
1705     }
1706     assert(!ref_index);
1707 }
1708
1709 static inline void pred_mv(SnowContext *s, int *mx, int *my, int ref,
1710                            const BlockNode *left, const BlockNode *top, const BlockNode *tr){
1711     if(s->ref_frames == 1){
1712         *mx = mid_pred(left->mx, top->mx, tr->mx);
1713         *my = mid_pred(left->my, top->my, tr->my);
1714     }else{
1715         const int *scale = scale_mv_ref[ref];
1716         *mx = mid_pred((left->mx * scale[left->ref] + 128) >>8,
1717                        (top ->mx * scale[top ->ref] + 128) >>8,
1718                        (tr  ->mx * scale[tr  ->ref] + 128) >>8);
1719         *my = mid_pred((left->my * scale[left->ref] + 128) >>8,
1720                        (top ->my * scale[top ->ref] + 128) >>8,
1721                        (tr  ->my * scale[tr  ->ref] + 128) >>8);
1722     }
1723 }
1724
1725 //FIXME copy&paste
1726 #define P_LEFT P[1]
1727 #define P_TOP P[2]
1728 #define P_TOPRIGHT P[3]
1729 #define P_MEDIAN P[4]
1730 #define P_MV1 P[9]
1731 #define FLAG_QPEL   1 //must be 1
1732
1733 static int encode_q_branch(SnowContext *s, int level, int x, int y){
1734     uint8_t p_buffer[1024];
1735     uint8_t i_buffer[1024];
1736     uint8_t p_state[sizeof(s->block_state)];
1737     uint8_t i_state[sizeof(s->block_state)];
1738     RangeCoder pc, ic;
1739     uint8_t *pbbak= s->c.bytestream;
1740     uint8_t *pbbak_start= s->c.bytestream_start;
1741     int score, score2, iscore, i_len, p_len, block_s, sum, base_bits;
1742     const int w= s->b_width  << s->block_max_depth;
1743     const int h= s->b_height << s->block_max_depth;
1744     const int rem_depth= s->block_max_depth - level;
1745     const int index= (x + y*w) << rem_depth;
1746     const int block_w= 1<<(LOG2_MB_SIZE - level);
1747     int trx= (x+1)<<rem_depth;
1748     int try= (y+1)<<rem_depth;
1749     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
1750     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
1751     const BlockNode *right = trx<w ? &s->block[index+1] : &null_block;
1752     const BlockNode *bottom= try<h ? &s->block[index+w] : &null_block;
1753     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1754     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1755     int pl = left->color[0];
1756     int pcb= left->color[1];
1757     int pcr= left->color[2];
1758     int pmx, pmy;
1759     int mx=0, my=0;
1760     int l,cr,cb;
1761     const int stride= s->current_picture.linesize[0];
1762     const int uvstride= s->current_picture.linesize[1];
1763     uint8_t *current_data[3]= { s->input_picture.data[0] + (x + y*  stride)*block_w,
1764                                 s->input_picture.data[1] + (x + y*uvstride)*block_w/2,
1765                                 s->input_picture.data[2] + (x + y*uvstride)*block_w/2};
1766     int P[10][2];
1767     int16_t last_mv[3][2];
1768     int qpel= !!(s->avctx->flags & CODEC_FLAG_QPEL); //unused
1769     const int shift= 1+qpel;
1770     MotionEstContext *c= &s->m.me;
1771     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
1772     int mx_context= av_log2(2*FFABS(left->mx - top->mx));
1773     int my_context= av_log2(2*FFABS(left->my - top->my));
1774     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
1775     int ref, best_ref, ref_score, ref_mx, ref_my;
1776
1777     assert(sizeof(s->block_state) >= 256);
1778     if(s->keyframe){
1779         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
1780         return 0;
1781     }
1782
1783 //    clip predictors / edge ?
1784
1785     P_LEFT[0]= left->mx;
1786     P_LEFT[1]= left->my;
1787     P_TOP [0]= top->mx;
1788     P_TOP [1]= top->my;
1789     P_TOPRIGHT[0]= tr->mx;
1790     P_TOPRIGHT[1]= tr->my;
1791
1792     last_mv[0][0]= s->block[index].mx;
1793     last_mv[0][1]= s->block[index].my;
1794     last_mv[1][0]= right->mx;
1795     last_mv[1][1]= right->my;
1796     last_mv[2][0]= bottom->mx;
1797     last_mv[2][1]= bottom->my;
1798
1799     s->m.mb_stride=2;
1800     s->m.mb_x=
1801     s->m.mb_y= 0;
1802     c->skip= 0;
1803
1804     assert(c->  stride ==   stride);
1805     assert(c->uvstride == uvstride);
1806
1807     c->penalty_factor    = get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_cmp);
1808     c->sub_penalty_factor= get_penalty_factor(s->lambda, s->lambda2, c->avctx->me_sub_cmp);
1809     c->mb_penalty_factor = get_penalty_factor(s->lambda, s->lambda2, c->avctx->mb_cmp);
1810     c->current_mv_penalty= c->mv_penalty[s->m.f_code=1] + MAX_MV;
1811
1812     c->xmin = - x*block_w - 16+2;
1813     c->ymin = - y*block_w - 16+2;
1814     c->xmax = - (x+1)*block_w + (w<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
1815     c->ymax = - (y+1)*block_w + (h<<(LOG2_MB_SIZE - s->block_max_depth)) + 16-2;
1816
1817     if(P_LEFT[0]     > (c->xmax<<shift)) P_LEFT[0]    = (c->xmax<<shift);
1818     if(P_LEFT[1]     > (c->ymax<<shift)) P_LEFT[1]    = (c->ymax<<shift);
1819     if(P_TOP[0]      > (c->xmax<<shift)) P_TOP[0]     = (c->xmax<<shift);
1820     if(P_TOP[1]      > (c->ymax<<shift)) P_TOP[1]     = (c->ymax<<shift);
1821     if(P_TOPRIGHT[0] < (c->xmin<<shift)) P_TOPRIGHT[0]= (c->xmin<<shift);
1822     if(P_TOPRIGHT[0] > (c->xmax<<shift)) P_TOPRIGHT[0]= (c->xmax<<shift); //due to pmx no clip
1823     if(P_TOPRIGHT[1] > (c->ymax<<shift)) P_TOPRIGHT[1]= (c->ymax<<shift);
1824
1825     P_MEDIAN[0]= mid_pred(P_LEFT[0], P_TOP[0], P_TOPRIGHT[0]);
1826     P_MEDIAN[1]= mid_pred(P_LEFT[1], P_TOP[1], P_TOPRIGHT[1]);
1827
1828     if (!y) {
1829         c->pred_x= P_LEFT[0];
1830         c->pred_y= P_LEFT[1];
1831     } else {
1832         c->pred_x = P_MEDIAN[0];
1833         c->pred_y = P_MEDIAN[1];
1834     }
1835
1836     score= INT_MAX;
1837     best_ref= 0;
1838     for(ref=0; ref<s->ref_frames; ref++){
1839         init_ref(c, current_data, s->last_picture[ref].data, NULL, block_w*x, block_w*y, 0);
1840
1841         ref_score= ff_epzs_motion_search(&s->m, &ref_mx, &ref_my, P, 0, /*ref_index*/ 0, last_mv,
1842                                          (1<<16)>>shift, level-LOG2_MB_SIZE+4, block_w);
1843
1844         assert(ref_mx >= c->xmin);
1845         assert(ref_mx <= c->xmax);
1846         assert(ref_my >= c->ymin);
1847         assert(ref_my <= c->ymax);
1848
1849         ref_score= c->sub_motion_search(&s->m, &ref_mx, &ref_my, ref_score, 0, 0, level-LOG2_MB_SIZE+4, block_w);
1850         ref_score= ff_get_mb_score(&s->m, ref_mx, ref_my, 0, 0, level-LOG2_MB_SIZE+4, block_w, 0);
1851         ref_score+= 2*av_log2(2*ref)*c->penalty_factor;
1852         if(s->ref_mvs[ref]){
1853             s->ref_mvs[ref][index][0]= ref_mx;
1854             s->ref_mvs[ref][index][1]= ref_my;
1855             s->ref_scores[ref][index]= ref_score;
1856         }
1857         if(score > ref_score){
1858             score= ref_score;
1859             best_ref= ref;
1860             mx= ref_mx;
1861             my= ref_my;
1862         }
1863     }
1864     //FIXME if mb_cmp != SSE then intra cannot be compared currently and mb_penalty vs. lambda2
1865
1866   //  subpel search
1867     base_bits= get_rac_count(&s->c) - 8*(s->c.bytestream - s->c.bytestream_start);
1868     pc= s->c;
1869     pc.bytestream_start=
1870     pc.bytestream= p_buffer; //FIXME end/start? and at the other stoo
1871     memcpy(p_state, s->block_state, sizeof(s->block_state));
1872
1873     if(level!=s->block_max_depth)
1874         put_rac(&pc, &p_state[4 + s_context], 1);
1875     put_rac(&pc, &p_state[1 + left->type + top->type], 0);
1876     if(s->ref_frames > 1)
1877         put_symbol(&pc, &p_state[128 + 1024 + 32*ref_context], best_ref, 0);
1878     pred_mv(s, &pmx, &pmy, best_ref, left, top, tr);
1879     put_symbol(&pc, &p_state[128 + 32*(mx_context + 16*!!best_ref)], mx - pmx, 1);
1880     put_symbol(&pc, &p_state[128 + 32*(my_context + 16*!!best_ref)], my - pmy, 1);
1881     p_len= pc.bytestream - pc.bytestream_start;
1882     score += (s->lambda2*(get_rac_count(&pc)-base_bits))>>FF_LAMBDA_SHIFT;
1883
1884     block_s= block_w*block_w;
1885     sum = pix_sum(current_data[0], stride, block_w);
1886     l= (sum + block_s/2)/block_s;
1887     iscore = pix_norm1(current_data[0], stride, block_w) - 2*l*sum + l*l*block_s;
1888
1889     block_s= block_w*block_w>>2;
1890     sum = pix_sum(current_data[1], uvstride, block_w>>1);
1891     cb= (sum + block_s/2)/block_s;
1892 //    iscore += pix_norm1(&current_mb[1][0], uvstride, block_w>>1) - 2*cb*sum + cb*cb*block_s;
1893     sum = pix_sum(current_data[2], uvstride, block_w>>1);
1894     cr= (sum + block_s/2)/block_s;
1895 //    iscore += pix_norm1(&current_mb[2][0], uvstride, block_w>>1) - 2*cr*sum + cr*cr*block_s;
1896
1897     ic= s->c;
1898     ic.bytestream_start=
1899     ic.bytestream= i_buffer; //FIXME end/start? and at the other stoo
1900     memcpy(i_state, s->block_state, sizeof(s->block_state));
1901     if(level!=s->block_max_depth)
1902         put_rac(&ic, &i_state[4 + s_context], 1);
1903     put_rac(&ic, &i_state[1 + left->type + top->type], 1);
1904     put_symbol(&ic, &i_state[32],  l-pl , 1);
1905     put_symbol(&ic, &i_state[64], cb-pcb, 1);
1906     put_symbol(&ic, &i_state[96], cr-pcr, 1);
1907     i_len= ic.bytestream - ic.bytestream_start;
1908     iscore += (s->lambda2*(get_rac_count(&ic)-base_bits))>>FF_LAMBDA_SHIFT;
1909
1910 //    assert(score==256*256*256*64-1);
1911     assert(iscore < 255*255*256 + s->lambda2*10);
1912     assert(iscore >= 0);
1913     assert(l>=0 && l<=255);
1914     assert(pl>=0 && pl<=255);
1915
1916     if(level==0){
1917         int varc= iscore >> 8;
1918         int vard= score >> 8;
1919         if (vard <= 64 || vard < varc)
1920             c->scene_change_score+= ff_sqrt(vard) - ff_sqrt(varc);
1921         else
1922             c->scene_change_score+= s->m.qscale;
1923     }
1924
1925     if(level!=s->block_max_depth){
1926         put_rac(&s->c, &s->block_state[4 + s_context], 0);
1927         score2 = encode_q_branch(s, level+1, 2*x+0, 2*y+0);
1928         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+0);
1929         score2+= encode_q_branch(s, level+1, 2*x+0, 2*y+1);
1930         score2+= encode_q_branch(s, level+1, 2*x+1, 2*y+1);
1931         score2+= s->lambda2>>FF_LAMBDA_SHIFT; //FIXME exact split overhead
1932
1933         if(score2 < score && score2 < iscore)
1934             return score2;
1935     }
1936
1937     if(iscore < score){
1938         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
1939         memcpy(pbbak, i_buffer, i_len);
1940         s->c= ic;
1941         s->c.bytestream_start= pbbak_start;
1942         s->c.bytestream= pbbak + i_len;
1943         set_blocks(s, level, x, y, l, cb, cr, pmx, pmy, 0, BLOCK_INTRA);
1944         memcpy(s->block_state, i_state, sizeof(s->block_state));
1945         return iscore;
1946     }else{
1947         memcpy(pbbak, p_buffer, p_len);
1948         s->c= pc;
1949         s->c.bytestream_start= pbbak_start;
1950         s->c.bytestream= pbbak + p_len;
1951         set_blocks(s, level, x, y, pl, pcb, pcr, mx, my, best_ref, 0);
1952         memcpy(s->block_state, p_state, sizeof(s->block_state));
1953         return score;
1954     }
1955 }
1956
1957 static av_always_inline int same_block(BlockNode *a, BlockNode *b){
1958     if((a->type&BLOCK_INTRA) && (b->type&BLOCK_INTRA)){
1959         return !((a->color[0] - b->color[0]) | (a->color[1] - b->color[1]) | (a->color[2] - b->color[2]));
1960     }else{
1961         return !((a->mx - b->mx) | (a->my - b->my) | (a->ref - b->ref) | ((a->type ^ b->type)&BLOCK_INTRA));
1962     }
1963 }
1964
1965 static void encode_q_branch2(SnowContext *s, int level, int x, int y){
1966     const int w= s->b_width  << s->block_max_depth;
1967     const int rem_depth= s->block_max_depth - level;
1968     const int index= (x + y*w) << rem_depth;
1969     int trx= (x+1)<<rem_depth;
1970     BlockNode *b= &s->block[index];
1971     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
1972     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
1973     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
1974     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
1975     int pl = left->color[0];
1976     int pcb= left->color[1];
1977     int pcr= left->color[2];
1978     int pmx, pmy;
1979     int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
1980     int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 16*!!b->ref;
1981     int my_context= av_log2(2*FFABS(left->my - top->my)) + 16*!!b->ref;
1982     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
1983
1984     if(s->keyframe){
1985         set_blocks(s, level, x, y, pl, pcb, pcr, 0, 0, 0, BLOCK_INTRA);
1986         return;
1987     }
1988
1989     if(level!=s->block_max_depth){
1990         if(same_block(b,b+1) && same_block(b,b+w) && same_block(b,b+w+1)){
1991             put_rac(&s->c, &s->block_state[4 + s_context], 1);
1992         }else{
1993             put_rac(&s->c, &s->block_state[4 + s_context], 0);
1994             encode_q_branch2(s, level+1, 2*x+0, 2*y+0);
1995             encode_q_branch2(s, level+1, 2*x+1, 2*y+0);
1996             encode_q_branch2(s, level+1, 2*x+0, 2*y+1);
1997             encode_q_branch2(s, level+1, 2*x+1, 2*y+1);
1998             return;
1999         }
2000     }
2001     if(b->type & BLOCK_INTRA){
2002         pred_mv(s, &pmx, &pmy, 0, left, top, tr);
2003         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 1);
2004         put_symbol(&s->c, &s->block_state[32], b->color[0]-pl , 1);
2005         put_symbol(&s->c, &s->block_state[64], b->color[1]-pcb, 1);
2006         put_symbol(&s->c, &s->block_state[96], b->color[2]-pcr, 1);
2007         set_blocks(s, level, x, y, b->color[0], b->color[1], b->color[2], pmx, pmy, 0, BLOCK_INTRA);
2008     }else{
2009         pred_mv(s, &pmx, &pmy, b->ref, left, top, tr);
2010         put_rac(&s->c, &s->block_state[1 + (left->type&1) + (top->type&1)], 0);
2011         if(s->ref_frames > 1)
2012             put_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], b->ref, 0);
2013         put_symbol(&s->c, &s->block_state[128 + 32*mx_context], b->mx - pmx, 1);
2014         put_symbol(&s->c, &s->block_state[128 + 32*my_context], b->my - pmy, 1);
2015         set_blocks(s, level, x, y, pl, pcb, pcr, b->mx, b->my, b->ref, 0);
2016     }
2017 }
2018
2019 static void decode_q_branch(SnowContext *s, int level, int x, int y){
2020     const int w= s->b_width << s->block_max_depth;
2021     const int rem_depth= s->block_max_depth - level;
2022     const int index= (x + y*w) << rem_depth;
2023     int trx= (x+1)<<rem_depth;
2024     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2025     const BlockNode *top   = y ? &s->block[index-w] : &null_block;
2026     const BlockNode *tl    = y && x ? &s->block[index-w-1] : left;
2027     const BlockNode *tr    = y && trx<w && ((x&1)==0 || level==0) ? &s->block[index-w+(1<<rem_depth)] : tl; //FIXME use lt
2028     int s_context= 2*left->level + 2*top->level + tl->level + tr->level;
2029
2030     if(s->keyframe){
2031         set_blocks(s, level, x, y, null_block.color[0], null_block.color[1], null_block.color[2], null_block.mx, null_block.my, null_block.ref, BLOCK_INTRA);
2032         return;
2033     }
2034
2035     if(level==s->block_max_depth || get_rac(&s->c, &s->block_state[4 + s_context])){
2036         int type, mx, my;
2037         int l = left->color[0];
2038         int cb= left->color[1];
2039         int cr= left->color[2];
2040         int ref = 0;
2041         int ref_context= av_log2(2*left->ref) + av_log2(2*top->ref);
2042         int mx_context= av_log2(2*FFABS(left->mx - top->mx)) + 0*av_log2(2*FFABS(tr->mx - top->mx));
2043         int my_context= av_log2(2*FFABS(left->my - top->my)) + 0*av_log2(2*FFABS(tr->my - top->my));
2044
2045         type= get_rac(&s->c, &s->block_state[1 + left->type + top->type]) ? BLOCK_INTRA : 0;
2046
2047         if(type){
2048             pred_mv(s, &mx, &my, 0, left, top, tr);
2049             l += get_symbol(&s->c, &s->block_state[32], 1);
2050             cb+= get_symbol(&s->c, &s->block_state[64], 1);
2051             cr+= get_symbol(&s->c, &s->block_state[96], 1);
2052         }else{
2053             if(s->ref_frames > 1)
2054                 ref= get_symbol(&s->c, &s->block_state[128 + 1024 + 32*ref_context], 0);
2055             pred_mv(s, &mx, &my, ref, left, top, tr);
2056             mx+= get_symbol(&s->c, &s->block_state[128 + 32*(mx_context + 16*!!ref)], 1);
2057             my+= get_symbol(&s->c, &s->block_state[128 + 32*(my_context + 16*!!ref)], 1);
2058         }
2059         set_blocks(s, level, x, y, l, cb, cr, mx, my, ref, type);
2060     }else{
2061         decode_q_branch(s, level+1, 2*x+0, 2*y+0);
2062         decode_q_branch(s, level+1, 2*x+1, 2*y+0);
2063         decode_q_branch(s, level+1, 2*x+0, 2*y+1);
2064         decode_q_branch(s, level+1, 2*x+1, 2*y+1);
2065     }
2066 }
2067
2068 static void encode_blocks(SnowContext *s, int search){
2069     int x, y;
2070     int w= s->b_width;
2071     int h= s->b_height;
2072
2073     if(s->avctx->me_method == ME_ITER && !s->keyframe && search)
2074         iterative_me(s);
2075
2076     for(y=0; y<h; y++){
2077         if(s->c.bytestream_end - s->c.bytestream < w*MB_SIZE*MB_SIZE*3){ //FIXME nicer limit
2078             av_log(s->avctx, AV_LOG_ERROR, "encoded frame too large\n");
2079             return;
2080         }
2081         for(x=0; x<w; x++){
2082             if(s->avctx->me_method == ME_ITER || !search)
2083                 encode_q_branch2(s, 0, x, y);
2084             else
2085                 encode_q_branch (s, 0, x, y);
2086         }
2087     }
2088 }
2089
2090 static void decode_blocks(SnowContext *s){
2091     int x, y;
2092     int w= s->b_width;
2093     int h= s->b_height;
2094
2095     for(y=0; y<h; y++){
2096         for(x=0; x<w; x++){
2097             decode_q_branch(s, 0, x, y);
2098         }
2099     }
2100 }
2101
2102 static void mc_block(uint8_t *dst, const uint8_t *src, uint8_t *tmp, int stride, int b_w, int b_h, int dx, int dy){
2103     int x, y;
2104 START_TIMER
2105     for(y=0; y < b_h+5; y++){
2106         for(x=0; x < b_w; x++){
2107             int a0= src[x    ];
2108             int a1= src[x + 1];
2109             int a2= src[x + 2];
2110             int a3= src[x + 3];
2111             int a4= src[x + 4];
2112             int a5= src[x + 5];
2113 //            int am= 9*(a1+a2) - (a0+a3);
2114             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2115 //            int am= 18*(a2+a3) - 2*(a1+a4);
2116 //             int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2117 //             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;
2118
2119 //            if(b_w==16) am= 8*(a1+a2);
2120
2121             if(dx<8) am = (32*a2*( 8-dx) +    am* dx    + 128)>>8;
2122             else     am = (   am*(16-dx) + 32*a3*(dx-8) + 128)>>8;
2123
2124             /* FIXME Try increasing tmp buffer to 16 bits and not clipping here. Should give marginally better results. - Robert*/
2125             if(am&(~255)) am= ~(am>>31);
2126
2127             tmp[x] = am;
2128
2129 /*            if     (dx< 4) tmp[x + y*stride]= (16*a1*( 4-dx) +    aL* dx     + 32)>>6;
2130             else if(dx< 8) tmp[x + y*stride]= (   aL*( 8-dx) +    am*(dx- 4) + 32)>>6;
2131             else if(dx<12) tmp[x + y*stride]= (   am*(12-dx) +    aR*(dx- 8) + 32)>>6;
2132             else           tmp[x + y*stride]= (   aR*(16-dx) + 16*a2*(dx-12) + 32)>>6;*/
2133         }
2134         tmp += stride;
2135         src += stride;
2136     }
2137     tmp -= (b_h+5)*stride;
2138
2139     for(y=0; y < b_h; y++){
2140         for(x=0; x < b_w; x++){
2141             int a0= tmp[x + 0*stride];
2142             int a1= tmp[x + 1*stride];
2143             int a2= tmp[x + 2*stride];
2144             int a3= tmp[x + 3*stride];
2145             int a4= tmp[x + 4*stride];
2146             int a5= tmp[x + 5*stride];
2147             int am= 20*(a2+a3) - 5*(a1+a4) + (a0+a5);
2148 //            int am= 18*(a2+a3) - 2*(a1+a4);
2149 /*            int aL= (-7*a0 + 105*a1 + 35*a2 - 5*a3)>>3;
2150             int aR= (-7*a3 + 105*a2 + 35*a1 - 5*a0)>>3;*/
2151
2152 //            if(b_w==16) am= 8*(a1+a2);
2153
2154             if(dy<8) am =  (32*a2*( 8-dy) +    am* dy    + 128)>>8;
2155             else     am = (   am*(16-dy) + 32*a3*(dy-8) + 128)>>8;
2156
2157             if(am&(~255)) am= ~(am>>31);
2158
2159             dst[x] = am;
2160 /*            if     (dy< 4) tmp[x + y*stride]= (16*a1*( 4-dy) +    aL* dy     + 32)>>6;
2161             else if(dy< 8) tmp[x + y*stride]= (   aL*( 8-dy) +    am*(dy- 4) + 32)>>6;
2162             else if(dy<12) tmp[x + y*stride]= (   am*(12-dy) +    aR*(dy- 8) + 32)>>6;
2163             else           tmp[x + y*stride]= (   aR*(16-dy) + 16*a2*(dy-12) + 32)>>6;*/
2164         }
2165         dst += stride;
2166         tmp += stride;
2167     }
2168 STOP_TIMER("mc_block")
2169 }
2170
2171 #define mca(dx,dy,b_w)\
2172 static void mc_block_hpel ## dx ## dy ## b_w(uint8_t *dst, const uint8_t *src, int stride, int h){\
2173     uint8_t tmp[stride*(b_w+5)];\
2174     assert(h==b_w);\
2175     mc_block(dst, src-2-2*stride, tmp, stride, b_w, b_w, dx, dy);\
2176 }
2177
2178 mca( 0, 0,16)
2179 mca( 8, 0,16)
2180 mca( 0, 8,16)
2181 mca( 8, 8,16)
2182 mca( 0, 0,8)
2183 mca( 8, 0,8)
2184 mca( 0, 8,8)
2185 mca( 8, 8,8)
2186
2187 static void pred_block(SnowContext *s, uint8_t *dst, uint8_t *tmp, int stride, int sx, int sy, int b_w, int b_h, BlockNode *block, int plane_index, int w, int h){
2188     if(block->type & BLOCK_INTRA){
2189         int x, y;
2190         const int color = block->color[plane_index];
2191         const int color4= color*0x01010101;
2192         if(b_w==32){
2193             for(y=0; y < b_h; y++){
2194                 *(uint32_t*)&dst[0 + y*stride]= color4;
2195                 *(uint32_t*)&dst[4 + y*stride]= color4;
2196                 *(uint32_t*)&dst[8 + y*stride]= color4;
2197                 *(uint32_t*)&dst[12+ y*stride]= color4;
2198                 *(uint32_t*)&dst[16+ y*stride]= color4;
2199                 *(uint32_t*)&dst[20+ y*stride]= color4;
2200                 *(uint32_t*)&dst[24+ y*stride]= color4;
2201                 *(uint32_t*)&dst[28+ y*stride]= color4;
2202             }
2203         }else if(b_w==16){
2204             for(y=0; y < b_h; y++){
2205                 *(uint32_t*)&dst[0 + y*stride]= color4;
2206                 *(uint32_t*)&dst[4 + y*stride]= color4;
2207                 *(uint32_t*)&dst[8 + y*stride]= color4;
2208                 *(uint32_t*)&dst[12+ y*stride]= color4;
2209             }
2210         }else if(b_w==8){
2211             for(y=0; y < b_h; y++){
2212                 *(uint32_t*)&dst[0 + y*stride]= color4;
2213                 *(uint32_t*)&dst[4 + y*stride]= color4;
2214             }
2215         }else if(b_w==4){
2216             for(y=0; y < b_h; y++){
2217                 *(uint32_t*)&dst[0 + y*stride]= color4;
2218             }
2219         }else{
2220             for(y=0; y < b_h; y++){
2221                 for(x=0; x < b_w; x++){
2222                     dst[x + y*stride]= color;
2223                 }
2224             }
2225         }
2226     }else{
2227         uint8_t *src= s->last_picture[block->ref].data[plane_index];
2228         const int scale= plane_index ?  s->mv_scale : 2*s->mv_scale;
2229         int mx= block->mx*scale;
2230         int my= block->my*scale;
2231         const int dx= mx&15;
2232         const int dy= my&15;
2233         const int tab_index= 3 - (b_w>>2) + (b_w>>4);
2234         sx += (mx>>4) - 2;
2235         sy += (my>>4) - 2;
2236         src += sx + sy*stride;
2237         if(   (unsigned)sx >= w - b_w - 4
2238            || (unsigned)sy >= h - b_h - 4){
2239             ff_emulated_edge_mc(tmp + MB_SIZE, src, stride, b_w+5, b_h+5, sx, sy, w, h);
2240             src= tmp + MB_SIZE;
2241         }
2242 //        assert(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h);
2243 //        assert(!(b_w&(b_w-1)));
2244         assert(b_w>1 && b_h>1);
2245         assert(tab_index>=0 && tab_index<4 || b_w==32);
2246         if((dx&3) || (dy&3) || !(b_w == b_h || 2*b_w == b_h || b_w == 2*b_h) || (b_w&(b_w-1)))
2247             mc_block(dst, src, tmp, stride, b_w, b_h, dx, dy);
2248         else if(b_w==32){
2249             int y;
2250             for(y=0; y<b_h; y+=16){
2251                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + y*stride, src + 2 + (y+2)*stride,stride);
2252                 s->dsp.put_h264_qpel_pixels_tab[0][dy+(dx>>2)](dst + 16 + y*stride, src + 18 + (y+2)*stride,stride);
2253             }
2254         }else if(b_w==b_h)
2255             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst,src + 2 + 2*stride,stride);
2256         else if(b_w==2*b_h){
2257             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst    ,src + 2       + 2*stride,stride);
2258             s->dsp.put_h264_qpel_pixels_tab[tab_index+1][dy+(dx>>2)](dst+b_h,src + 2 + b_h + 2*stride,stride);
2259         }else{
2260             assert(2*b_w==b_h);
2261             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst           ,src + 2 + 2*stride           ,stride);
2262             s->dsp.put_h264_qpel_pixels_tab[tab_index  ][dy+(dx>>2)](dst+b_w*stride,src + 2 + 2*stride+b_w*stride,stride);
2263         }
2264     }
2265 }
2266
2267 void ff_snow_inner_add_yblock(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
2268                               int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
2269     int y, x;
2270     IDWTELEM * dst;
2271     for(y=0; y<b_h; y++){
2272         //FIXME ugly misuse of obmc_stride
2273         const uint8_t *obmc1= obmc + y*obmc_stride;
2274         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2275         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2276         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2277         dst = slice_buffer_get_line(sb, src_y + y);
2278         for(x=0; x<b_w; x++){
2279             int v=   obmc1[x] * block[3][x + y*src_stride]
2280                     +obmc2[x] * block[2][x + y*src_stride]
2281                     +obmc3[x] * block[1][x + y*src_stride]
2282                     +obmc4[x] * block[0][x + y*src_stride];
2283
2284             v <<= 8 - LOG2_OBMC_MAX;
2285             if(FRAC_BITS != 8){
2286                 v >>= 8 - FRAC_BITS;
2287             }
2288             if(add){
2289                 v += dst[x + src_x];
2290                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2291                 if(v&(~255)) v= ~(v>>31);
2292                 dst8[x + y*src_stride] = v;
2293             }else{
2294                 dst[x + src_x] -= v;
2295             }
2296         }
2297     }
2298 }
2299
2300 //FIXME name clenup (b_w, block_w, b_width stuff)
2301 static av_always_inline void add_yblock(SnowContext *s, int sliced, slice_buffer *sb, IDWTELEM *dst, uint8_t *dst8, const uint8_t *obmc, int src_x, int src_y, int b_w, int b_h, int w, int h, int dst_stride, int src_stride, int obmc_stride, int b_x, int b_y, int add, int offset_dst, int plane_index){
2302     const int b_width = s->b_width  << s->block_max_depth;
2303     const int b_height= s->b_height << s->block_max_depth;
2304     const int b_stride= b_width;
2305     BlockNode *lt= &s->block[b_x + b_y*b_stride];
2306     BlockNode *rt= lt+1;
2307     BlockNode *lb= lt+b_stride;
2308     BlockNode *rb= lb+1;
2309     uint8_t *block[4];
2310     int tmp_step= src_stride >= 7*MB_SIZE ? MB_SIZE : MB_SIZE*src_stride;
2311     uint8_t tmp[src_stride*7*MB_SIZE]; //FIXME align
2312     uint8_t *ptmp;
2313     int x,y;
2314
2315     if(b_x<0){
2316         lt= rt;
2317         lb= rb;
2318     }else if(b_x + 1 >= b_width){
2319         rt= lt;
2320         rb= lb;
2321     }
2322     if(b_y<0){
2323         lt= lb;
2324         rt= rb;
2325     }else if(b_y + 1 >= b_height){
2326         lb= lt;
2327         rb= rt;
2328     }
2329
2330     if(src_x<0){ //FIXME merge with prev & always round internal width upto *16
2331         obmc -= src_x;
2332         b_w += src_x;
2333         if(!sliced && !offset_dst)
2334             dst -= src_x;
2335         src_x=0;
2336     }else if(src_x + b_w > w){
2337         b_w = w - src_x;
2338     }
2339     if(src_y<0){
2340         obmc -= src_y*obmc_stride;
2341         b_h += src_y;
2342         if(!sliced && !offset_dst)
2343             dst -= src_y*dst_stride;
2344         src_y=0;
2345     }else if(src_y + b_h> h){
2346         b_h = h - src_y;
2347     }
2348
2349     if(b_w<=0 || b_h<=0) return;
2350
2351 assert(src_stride > 2*MB_SIZE + 5);
2352     if(!sliced && offset_dst)
2353         dst += src_x + src_y*dst_stride;
2354     dst8+= src_x + src_y*src_stride;
2355 //    src += src_x + src_y*src_stride;
2356
2357     ptmp= tmp + 3*tmp_step;
2358     block[0]= ptmp;
2359     ptmp+=tmp_step;
2360     pred_block(s, block[0], tmp, src_stride, src_x, src_y, b_w, b_h, lt, plane_index, w, h);
2361
2362     if(same_block(lt, rt)){
2363         block[1]= block[0];
2364     }else{
2365         block[1]= ptmp;
2366         ptmp+=tmp_step;
2367         pred_block(s, block[1], tmp, src_stride, src_x, src_y, b_w, b_h, rt, plane_index, w, h);
2368     }
2369
2370     if(same_block(lt, lb)){
2371         block[2]= block[0];
2372     }else if(same_block(rt, lb)){
2373         block[2]= block[1];
2374     }else{
2375         block[2]= ptmp;
2376         ptmp+=tmp_step;
2377         pred_block(s, block[2], tmp, src_stride, src_x, src_y, b_w, b_h, lb, plane_index, w, h);
2378     }
2379
2380     if(same_block(lt, rb) ){
2381         block[3]= block[0];
2382     }else if(same_block(rt, rb)){
2383         block[3]= block[1];
2384     }else if(same_block(lb, rb)){
2385         block[3]= block[2];
2386     }else{
2387         block[3]= ptmp;
2388         pred_block(s, block[3], tmp, src_stride, src_x, src_y, b_w, b_h, rb, plane_index, w, h);
2389     }
2390 #if 0
2391     for(y=0; y<b_h; y++){
2392         for(x=0; x<b_w; x++){
2393             int v=   obmc [x + y*obmc_stride] * block[3][x + y*src_stride] * (256/OBMC_MAX);
2394             if(add) dst[x + y*dst_stride] += v;
2395             else    dst[x + y*dst_stride] -= v;
2396         }
2397     }
2398     for(y=0; y<b_h; y++){
2399         uint8_t *obmc2= obmc + (obmc_stride>>1);
2400         for(x=0; x<b_w; x++){
2401             int v=   obmc2[x + y*obmc_stride] * block[2][x + y*src_stride] * (256/OBMC_MAX);
2402             if(add) dst[x + y*dst_stride] += v;
2403             else    dst[x + y*dst_stride] -= v;
2404         }
2405     }
2406     for(y=0; y<b_h; y++){
2407         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2408         for(x=0; x<b_w; x++){
2409             int v=   obmc3[x + y*obmc_stride] * block[1][x + y*src_stride] * (256/OBMC_MAX);
2410             if(add) dst[x + y*dst_stride] += v;
2411             else    dst[x + y*dst_stride] -= v;
2412         }
2413     }
2414     for(y=0; y<b_h; y++){
2415         uint8_t *obmc3= obmc + obmc_stride*(obmc_stride>>1);
2416         uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2417         for(x=0; x<b_w; x++){
2418             int v=   obmc4[x + y*obmc_stride] * block[0][x + y*src_stride] * (256/OBMC_MAX);
2419             if(add) dst[x + y*dst_stride] += v;
2420             else    dst[x + y*dst_stride] -= v;
2421         }
2422     }
2423 #else
2424     if(sliced){
2425         START_TIMER
2426
2427         s->dsp.inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
2428         STOP_TIMER("inner_add_yblock")
2429     }else
2430     for(y=0; y<b_h; y++){
2431         //FIXME ugly misuse of obmc_stride
2432         const uint8_t *obmc1= obmc + y*obmc_stride;
2433         const uint8_t *obmc2= obmc1+ (obmc_stride>>1);
2434         const uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
2435         const uint8_t *obmc4= obmc3+ (obmc_stride>>1);
2436         for(x=0; x<b_w; x++){
2437             int v=   obmc1[x] * block[3][x + y*src_stride]
2438                     +obmc2[x] * block[2][x + y*src_stride]
2439                     +obmc3[x] * block[1][x + y*src_stride]
2440                     +obmc4[x] * block[0][x + y*src_stride];
2441
2442             v <<= 8 - LOG2_OBMC_MAX;
2443             if(FRAC_BITS != 8){
2444                 v >>= 8 - FRAC_BITS;
2445             }
2446             if(add){
2447                 v += dst[x + y*dst_stride];
2448                 v = (v + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
2449                 if(v&(~255)) v= ~(v>>31);
2450                 dst8[x + y*src_stride] = v;
2451             }else{
2452                 dst[x + y*dst_stride] -= v;
2453             }
2454         }
2455     }
2456 #endif
2457 }
2458
2459 static av_always_inline void predict_slice_buffered(SnowContext *s, slice_buffer * sb, IDWTELEM * old_buffer, int plane_index, int add, int mb_y){
2460     Plane *p= &s->plane[plane_index];
2461     const int mb_w= s->b_width  << s->block_max_depth;
2462     const int mb_h= s->b_height << s->block_max_depth;
2463     int x, y, mb_x;
2464     int block_size = MB_SIZE >> s->block_max_depth;
2465     int block_w    = plane_index ? block_size/2 : block_size;
2466     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2467     int obmc_stride= plane_index ? block_size : 2*block_size;
2468     int ref_stride= s->current_picture.linesize[plane_index];
2469     uint8_t *dst8= s->current_picture.data[plane_index];
2470     int w= p->width;
2471     int h= p->height;
2472     START_TIMER
2473
2474     if(s->keyframe || (s->avctx->debug&512)){
2475         if(mb_y==mb_h)
2476             return;
2477
2478         if(add){
2479             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2480             {
2481 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2482                 IDWTELEM * line = sb->line[y];
2483                 for(x=0; x<w; x++)
2484                 {
2485 //                    int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2486                     int v= line[x] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2487                     v >>= FRAC_BITS;
2488                     if(v&(~255)) v= ~(v>>31);
2489                     dst8[x + y*ref_stride]= v;
2490                 }
2491             }
2492         }else{
2493             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++)
2494             {
2495 //                DWTELEM * line = slice_buffer_get_line(sb, y);
2496                 IDWTELEM * line = sb->line[y];
2497                 for(x=0; x<w; x++)
2498                 {
2499                     line[x] -= 128 << FRAC_BITS;
2500 //                    buf[x + y*w]-= 128<<FRAC_BITS;
2501                 }
2502             }
2503         }
2504
2505         return;
2506     }
2507
2508         for(mb_x=0; mb_x<=mb_w; mb_x++){
2509             START_TIMER
2510
2511             add_yblock(s, 1, sb, old_buffer, dst8, obmc,
2512                        block_w*mb_x - block_w/2,
2513                        block_w*mb_y - block_w/2,
2514                        block_w, block_w,
2515                        w, h,
2516                        w, ref_stride, obmc_stride,
2517                        mb_x - 1, mb_y - 1,
2518                        add, 0, plane_index);
2519
2520             STOP_TIMER("add_yblock")
2521         }
2522
2523     STOP_TIMER("predict_slice")
2524 }
2525
2526 static av_always_inline void predict_slice(SnowContext *s, IDWTELEM *buf, int plane_index, int add, int mb_y){
2527     Plane *p= &s->plane[plane_index];
2528     const int mb_w= s->b_width  << s->block_max_depth;
2529     const int mb_h= s->b_height << s->block_max_depth;
2530     int x, y, mb_x;
2531     int block_size = MB_SIZE >> s->block_max_depth;
2532     int block_w    = plane_index ? block_size/2 : block_size;
2533     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2534     const int obmc_stride= plane_index ? block_size : 2*block_size;
2535     int ref_stride= s->current_picture.linesize[plane_index];
2536     uint8_t *dst8= s->current_picture.data[plane_index];
2537     int w= p->width;
2538     int h= p->height;
2539     START_TIMER
2540
2541     if(s->keyframe || (s->avctx->debug&512)){
2542         if(mb_y==mb_h)
2543             return;
2544
2545         if(add){
2546             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2547                 for(x=0; x<w; x++){
2548                     int v= buf[x + y*w] + (128<<FRAC_BITS) + (1<<(FRAC_BITS-1));
2549                     v >>= FRAC_BITS;
2550                     if(v&(~255)) v= ~(v>>31);
2551                     dst8[x + y*ref_stride]= v;
2552                 }
2553             }
2554         }else{
2555             for(y=block_w*mb_y; y<FFMIN(h,block_w*(mb_y+1)); y++){
2556                 for(x=0; x<w; x++){
2557                     buf[x + y*w]-= 128<<FRAC_BITS;
2558                 }
2559             }
2560         }
2561
2562         return;
2563     }
2564
2565         for(mb_x=0; mb_x<=mb_w; mb_x++){
2566             START_TIMER
2567
2568             add_yblock(s, 0, NULL, buf, dst8, obmc,
2569                        block_w*mb_x - block_w/2,
2570                        block_w*mb_y - block_w/2,
2571                        block_w, block_w,
2572                        w, h,
2573                        w, ref_stride, obmc_stride,
2574                        mb_x - 1, mb_y - 1,
2575                        add, 1, plane_index);
2576
2577             STOP_TIMER("add_yblock")
2578         }
2579
2580     STOP_TIMER("predict_slice")
2581 }
2582
2583 static av_always_inline void predict_plane(SnowContext *s, IDWTELEM *buf, int plane_index, int add){
2584     const int mb_h= s->b_height << s->block_max_depth;
2585     int mb_y;
2586     for(mb_y=0; mb_y<=mb_h; mb_y++)
2587         predict_slice(s, buf, plane_index, add, mb_y);
2588 }
2589
2590 static int get_dc(SnowContext *s, int mb_x, int mb_y, int plane_index){
2591     int i, x2, y2;
2592     Plane *p= &s->plane[plane_index];
2593     const int block_size = MB_SIZE >> s->block_max_depth;
2594     const int block_w    = plane_index ? block_size/2 : block_size;
2595     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2596     const int obmc_stride= plane_index ? block_size : 2*block_size;
2597     const int ref_stride= s->current_picture.linesize[plane_index];
2598     uint8_t *src= s-> input_picture.data[plane_index];
2599     IDWTELEM *dst= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4; //FIXME change to unsigned
2600     const int b_stride = s->b_width << s->block_max_depth;
2601     const int w= p->width;
2602     const int h= p->height;
2603     int index= mb_x + mb_y*b_stride;
2604     BlockNode *b= &s->block[index];
2605     BlockNode backup= *b;
2606     int ab=0;
2607     int aa=0;
2608
2609     b->type|= BLOCK_INTRA;
2610     b->color[plane_index]= 0;
2611     memset(dst, 0, obmc_stride*obmc_stride*sizeof(IDWTELEM));
2612
2613     for(i=0; i<4; i++){
2614         int mb_x2= mb_x + (i &1) - 1;
2615         int mb_y2= mb_y + (i>>1) - 1;
2616         int x= block_w*mb_x2 + block_w/2;
2617         int y= block_w*mb_y2 + block_w/2;
2618
2619         add_yblock(s, 0, NULL, dst + ((i&1)+(i>>1)*obmc_stride)*block_w, NULL, obmc,
2620                     x, y, block_w, block_w, w, h, obmc_stride, ref_stride, obmc_stride, mb_x2, mb_y2, 0, 0, plane_index);
2621
2622         for(y2= FFMAX(y, 0); y2<FFMIN(h, y+block_w); y2++){
2623             for(x2= FFMAX(x, 0); x2<FFMIN(w, x+block_w); x2++){
2624                 int index= x2-(block_w*mb_x - block_w/2) + (y2-(block_w*mb_y - block_w/2))*obmc_stride;
2625                 int obmc_v= obmc[index];
2626                 int d;
2627                 if(y<0) obmc_v += obmc[index + block_w*obmc_stride];
2628                 if(x<0) obmc_v += obmc[index + block_w];
2629                 if(y+block_w>h) obmc_v += obmc[index - block_w*obmc_stride];
2630                 if(x+block_w>w) obmc_v += obmc[index - block_w];
2631                 //FIXME precalc this or simplify it somehow else
2632
2633                 d = -dst[index] + (1<<(FRAC_BITS-1));
2634                 dst[index] = d;
2635                 ab += (src[x2 + y2*ref_stride] - (d>>FRAC_BITS)) * obmc_v;
2636                 aa += obmc_v * obmc_v; //FIXME precalclate this
2637             }
2638         }
2639     }
2640     *b= backup;
2641
2642     return av_clip(((ab<<LOG2_OBMC_MAX) + aa/2)/aa, 0, 255); //FIXME we should not need clipping
2643 }
2644
2645 static inline int get_block_bits(SnowContext *s, int x, int y, int w){
2646     const int b_stride = s->b_width << s->block_max_depth;
2647     const int b_height = s->b_height<< s->block_max_depth;
2648     int index= x + y*b_stride;
2649     const BlockNode *b     = &s->block[index];
2650     const BlockNode *left  = x ? &s->block[index-1] : &null_block;
2651     const BlockNode *top   = y ? &s->block[index-b_stride] : &null_block;
2652     const BlockNode *tl    = y && x ? &s->block[index-b_stride-1] : left;
2653     const BlockNode *tr    = y && x+w<b_stride ? &s->block[index-b_stride+w] : tl;
2654     int dmx, dmy;
2655 //  int mx_context= av_log2(2*FFABS(left->mx - top->mx));
2656 //  int my_context= av_log2(2*FFABS(left->my - top->my));
2657
2658     if(x<0 || x>=b_stride || y>=b_height)
2659         return 0;
2660 /*
2661 1            0      0
2662 01X          1-2    1
2663 001XX        3-6    2-3
2664 0001XXX      7-14   4-7
2665 00001XXXX   15-30   8-15
2666 */
2667 //FIXME try accurate rate
2668 //FIXME intra and inter predictors if surrounding blocks arent the same type
2669     if(b->type & BLOCK_INTRA){
2670         return 3+2*( av_log2(2*FFABS(left->color[0] - b->color[0]))
2671                    + av_log2(2*FFABS(left->color[1] - b->color[1]))
2672                    + av_log2(2*FFABS(left->color[2] - b->color[2])));
2673     }else{
2674         pred_mv(s, &dmx, &dmy, b->ref, left, top, tr);
2675         dmx-= b->mx;
2676         dmy-= b->my;
2677         return 2*(1 + av_log2(2*FFABS(dmx)) //FIXME kill the 2* can be merged in lambda
2678                     + av_log2(2*FFABS(dmy))
2679                     + av_log2(2*b->ref));
2680     }
2681 }
2682
2683 static int get_block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index, const uint8_t *obmc_edged){
2684     Plane *p= &s->plane[plane_index];
2685     const int block_size = MB_SIZE >> s->block_max_depth;
2686     const int block_w    = plane_index ? block_size/2 : block_size;
2687     const int obmc_stride= plane_index ? block_size : 2*block_size;
2688     const int ref_stride= s->current_picture.linesize[plane_index];
2689     uint8_t *dst= s->current_picture.data[plane_index];
2690     uint8_t *src= s->  input_picture.data[plane_index];
2691     IDWTELEM *pred= (IDWTELEM*)s->m.obmc_scratchpad + plane_index*block_size*block_size*4;
2692     uint8_t cur[ref_stride*2*MB_SIZE]; //FIXME alignment
2693     uint8_t tmp[ref_stride*(2*MB_SIZE+5)];
2694     const int b_stride = s->b_width << s->block_max_depth;
2695     const int b_height = s->b_height<< s->block_max_depth;
2696     const int w= p->width;
2697     const int h= p->height;
2698     int distortion;
2699     int rate= 0;
2700     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2701     int sx= block_w*mb_x - block_w/2;
2702     int sy= block_w*mb_y - block_w/2;
2703     int x0= FFMAX(0,-sx);
2704     int y0= FFMAX(0,-sy);
2705     int x1= FFMIN(block_w*2, w-sx);
2706     int y1= FFMIN(block_w*2, h-sy);
2707     int i,x,y;
2708
2709     pred_block(s, cur, tmp, ref_stride, sx, sy, block_w*2, block_w*2, &s->block[mb_x + mb_y*b_stride], plane_index, w, h);
2710
2711     for(y=y0; y<y1; y++){
2712         const uint8_t *obmc1= obmc_edged + y*obmc_stride;
2713         const IDWTELEM *pred1 = pred + y*obmc_stride;
2714         uint8_t *cur1 = cur + y*ref_stride;
2715         uint8_t *dst1 = dst + sx + (sy+y)*ref_stride;
2716         for(x=x0; x<x1; x++){
2717 #if FRAC_BITS >= LOG2_OBMC_MAX
2718             int v = (cur1[x] * obmc1[x]) << (FRAC_BITS - LOG2_OBMC_MAX);
2719 #else
2720             int v = (cur1[x] * obmc1[x] + (1<<(LOG2_OBMC_MAX - FRAC_BITS-1))) >> (LOG2_OBMC_MAX - FRAC_BITS);
2721 #endif
2722             v = (v + pred1[x]) >> FRAC_BITS;
2723             if(v&(~255)) v= ~(v>>31);
2724             dst1[x] = v;
2725         }
2726     }
2727
2728     /* copy the regions where obmc[] = (uint8_t)256 */
2729     if(LOG2_OBMC_MAX == 8
2730         && (mb_x == 0 || mb_x == b_stride-1)
2731         && (mb_y == 0 || mb_y == b_height-1)){
2732         if(mb_x == 0)
2733             x1 = block_w;
2734         else
2735             x0 = block_w;
2736         if(mb_y == 0)
2737             y1 = block_w;
2738         else
2739             y0 = block_w;
2740         for(y=y0; y<y1; y++)
2741             memcpy(dst + sx+x0 + (sy+y)*ref_stride, cur + x0 + y*ref_stride, x1-x0);
2742     }
2743
2744     if(block_w==16){
2745         /* FIXME rearrange dsputil to fit 32x32 cmp functions */
2746         /* FIXME check alignment of the cmp wavelet vs the encoding wavelet */
2747         /* FIXME cmps overlap but don't cover the wavelet's whole support,
2748          * so improving the score of one block is not strictly guaranteed to
2749          * improve the score of the whole frame, so iterative motion est
2750          * doesn't always converge. */
2751         if(s->avctx->me_cmp == FF_CMP_W97)
2752             distortion = w97_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2753         else if(s->avctx->me_cmp == FF_CMP_W53)
2754             distortion = w53_32_c(&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, 32);
2755         else{
2756             distortion = 0;
2757             for(i=0; i<4; i++){
2758                 int off = sx+16*(i&1) + (sy+16*(i>>1))*ref_stride;
2759                 distortion += s->dsp.me_cmp[0](&s->m, src + off, dst + off, ref_stride, 16);
2760             }
2761         }
2762     }else{
2763         assert(block_w==8);
2764         distortion = s->dsp.me_cmp[0](&s->m, src + sx + sy*ref_stride, dst + sx + sy*ref_stride, ref_stride, block_w*2);
2765     }
2766
2767     if(plane_index==0){
2768         for(i=0; i<4; i++){
2769 /* ..RRr
2770  * .RXx.
2771  * rxx..
2772  */
2773             rate += get_block_bits(s, mb_x + (i&1) - (i>>1), mb_y + (i>>1), 1);
2774         }
2775         if(mb_x == b_stride-2)
2776             rate += get_block_bits(s, mb_x + 1, mb_y + 1, 1);
2777     }
2778     return distortion + rate*penalty_factor;
2779 }
2780
2781 static int get_4block_rd(SnowContext *s, int mb_x, int mb_y, int plane_index){
2782     int i, y2;
2783     Plane *p= &s->plane[plane_index];
2784     const int block_size = MB_SIZE >> s->block_max_depth;
2785     const int block_w    = plane_index ? block_size/2 : block_size;
2786     const uint8_t *obmc  = plane_index ? obmc_tab[s->block_max_depth+1] : obmc_tab[s->block_max_depth];
2787     const int obmc_stride= plane_index ? block_size : 2*block_size;
2788     const int ref_stride= s->current_picture.linesize[plane_index];
2789     uint8_t *dst= s->current_picture.data[plane_index];
2790     uint8_t *src= s-> input_picture.data[plane_index];
2791     static const IDWTELEM zero_dst[4096]; //FIXME
2792     const int b_stride = s->b_width << s->block_max_depth;
2793     const int w= p->width;
2794     const int h= p->height;
2795     int distortion= 0;
2796     int rate= 0;
2797     const int penalty_factor= get_penalty_factor(s->lambda, s->lambda2, s->avctx->me_cmp);
2798
2799     for(i=0; i<9; i++){
2800         int mb_x2= mb_x + (i%3) - 1;
2801         int mb_y2= mb_y + (i/3) - 1;
2802         int x= block_w*mb_x2 + block_w/2;
2803         int y= block_w*mb_y2 + block_w/2;
2804
2805         add_yblock(s, 0, NULL, zero_dst, dst, obmc,
2806                    x, y, block_w, block_w, w, h, /*dst_stride*/0, ref_stride, obmc_stride, mb_x2, mb_y2, 1, 1, plane_index);
2807
2808         //FIXME find a cleaner/simpler way to skip the outside stuff
2809         for(y2= y; y2<0; y2++)
2810             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2811         for(y2= h; y2<y+block_w; y2++)
2812             memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, block_w);
2813         if(x<0){
2814             for(y2= y; y2<y+block_w; y2++)
2815                 memcpy(dst + x + y2*ref_stride, src + x + y2*ref_stride, -x);
2816         }
2817         if(x+block_w > w){
2818             for(y2= y; y2<y+block_w; y2++)
2819                 memcpy(dst + w + y2*ref_stride, src + w + y2*ref_stride, x+block_w - w);
2820         }
2821
2822         assert(block_w== 8 || block_w==16);
2823         distortion += s->dsp.me_cmp[block_w==8](&s->m, src + x + y*ref_stride, dst + x + y*ref_stride, ref_stride, block_w);
2824     }
2825
2826     if(plane_index==0){
2827         BlockNode *b= &s->block[mb_x+mb_y*b_stride];
2828         int merged= same_block(b,b+1) && same_block(b,b+b_stride) && same_block(b,b+b_stride+1);
2829
2830 /* ..RRRr
2831  * .RXXx.
2832  * .RXXx.
2833  * rxxx.
2834  */
2835         if(merged)
2836             rate = get_block_bits(s, mb_x, mb_y, 2);
2837         for(i=merged?4:0; i<9; i++){
2838             static const int dxy[9][2] = {{0,0},{1,0},{0,1},{1,1},{2,0},{2,1},{-1,2},{0,2},{1,2}};
2839             rate += get_block_bits(s, mb_x + dxy[i][0], mb_y + dxy[i][1], 1);
2840         }
2841     }
2842     return distortion + rate*penalty_factor;
2843 }
2844
2845 static av_always_inline int check_block(SnowContext *s, int mb_x, int mb_y, int p[3], int intra, const uint8_t *obmc_edged, int *best_rd){
2846     const int b_stride= s->b_width << s->block_max_depth;
2847     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
2848     BlockNode backup= *block;
2849     int rd, index, value;
2850
2851     assert(mb_x>=0 && mb_y>=0);
2852     assert(mb_x<b_stride);
2853
2854     if(intra){
2855         block->color[0] = p[0];
2856         block->color[1] = p[1];
2857         block->color[2] = p[2];
2858         block->type |= BLOCK_INTRA;
2859     }else{
2860         index= (p[0] + 31*p[1]) & (ME_CACHE_SIZE-1);
2861         value= s->me_cache_generation + (p[0]>>10) + (p[1]<<6) + (block->ref<<12);
2862         if(s->me_cache[index] == value)
2863             return 0;
2864         s->me_cache[index]= value;
2865
2866         block->mx= p[0];
2867         block->my= p[1];
2868         block->type &= ~BLOCK_INTRA;
2869     }
2870
2871     rd= get_block_rd(s, mb_x, mb_y, 0, obmc_edged);
2872
2873 //FIXME chroma
2874     if(rd < *best_rd){
2875         *best_rd= rd;
2876         return 1;
2877     }else{
2878         *block= backup;
2879         return 0;
2880     }
2881 }
2882
2883 /* special case for int[2] args we discard afterward, fixes compilation prob with gcc 2.95 */
2884 static av_always_inline int check_block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, const uint8_t *obmc_edged, int *best_rd){
2885     int p[2] = {p0, p1};
2886     return check_block(s, mb_x, mb_y, p, 0, obmc_edged, best_rd);
2887 }
2888
2889 static av_always_inline int check_4block_inter(SnowContext *s, int mb_x, int mb_y, int p0, int p1, int ref, int *best_rd){
2890     const int b_stride= s->b_width << s->block_max_depth;
2891     BlockNode *block= &s->block[mb_x + mb_y * b_stride];
2892     BlockNode backup[4]= {block[0], block[1], block[b_stride], block[b_stride+1]};
2893     int rd, index, value;
2894
2895     assert(mb_x>=0 && mb_y>=0);
2896     assert(mb_x<b_stride);
2897     assert(((mb_x|mb_y)&1) == 0);
2898
2899     index= (p0 + 31*p1) & (ME_CACHE_SIZE-1);
2900     value= s->me_cache_generation + (p0>>10) + (p1<<6) + (block->ref<<12);
2901     if(s->me_cache[index] == value)
2902         return 0;
2903     s->me_cache[index]= value;
2904
2905     block->mx= p0;
2906     block->my= p1;
2907     block->ref= ref;
2908     block->type &= ~BLOCK_INTRA;
2909     block[1]= block[b_stride]= block[b_stride+1]= *block;
2910
2911     rd= get_4block_rd(s, mb_x, mb_y, 0);
2912
2913 //FIXME chroma
2914     if(rd < *best_rd){
2915         *best_rd= rd;
2916         return 1;
2917     }else{
2918         block[0]= backup[0];
2919         block[1]= backup[1];
2920         block[b_stride]= backup[2];
2921         block[b_stride+1]= backup[3];
2922         return 0;
2923     }
2924 }
2925
2926 static void iterative_me(SnowContext *s){
2927     int pass, mb_x, mb_y;
2928     const int b_width = s->b_width  << s->block_max_depth;
2929     const int b_height= s->b_height << s->block_max_depth;
2930     const int b_stride= b_width;
2931     int color[3];
2932
2933     {
2934         RangeCoder r = s->c;
2935         uint8_t state[sizeof(s->block_state)];
2936         memcpy(state, s->block_state, sizeof(s->block_state));
2937         for(mb_y= 0; mb_y<s->b_height; mb_y++)
2938             for(mb_x= 0; mb_x<s->b_width; mb_x++)
2939                 encode_q_branch(s, 0, mb_x, mb_y);
2940         s->c = r;
2941         memcpy(s->block_state, state, sizeof(s->block_state));
2942     }
2943
2944     for(pass=0; pass<25; pass++){
2945         int change= 0;
2946
2947         for(mb_y= 0; mb_y<b_height; mb_y++){
2948             for(mb_x= 0; mb_x<b_width; mb_x++){
2949                 int dia_change, i, j, ref;
2950                 int best_rd= INT_MAX, ref_rd;
2951                 BlockNode backup, ref_b;
2952                 const int index= mb_x + mb_y * b_stride;
2953                 BlockNode *block= &s->block[index];
2954                 BlockNode *tb =                   mb_y            ? &s->block[index-b_stride  ] : NULL;
2955                 BlockNode *lb = mb_x                              ? &s->block[index         -1] : NULL;
2956                 BlockNode *rb = mb_x+1<b_width                    ? &s->block[index         +1] : NULL;
2957                 BlockNode *bb =                   mb_y+1<b_height ? &s->block[index+b_stride  ] : NULL;
2958                 BlockNode *tlb= mb_x           && mb_y            ? &s->block[index-b_stride-1] : NULL;
2959                 BlockNode *trb= mb_x+1<b_width && mb_y            ? &s->block[index-b_stride+1] : NULL;
2960                 BlockNode *blb= mb_x           && mb_y+1<b_height ? &s->block[index+b_stride-1] : NULL;
2961                 BlockNode *brb= mb_x+1<b_width && mb_y+1<b_height ? &s->block[index+b_stride+1] : NULL;
2962                 const int b_w= (MB_SIZE >> s->block_max_depth);
2963                 uint8_t obmc_edged[b_w*2][b_w*2];
2964
2965                 if(pass && (block->type & BLOCK_OPT))
2966                     continue;
2967                 block->type |= BLOCK_OPT;
2968
2969                 backup= *block;
2970
2971                 if(!s->me_cache_generation)
2972                     memset(s->me_cache, 0, sizeof(s->me_cache));
2973                 s->me_cache_generation += 1<<22;
2974
2975                 //FIXME precalc
2976                 {
2977                     int x, y;
2978                     memcpy(obmc_edged, obmc_tab[s->block_max_depth], b_w*b_w*4);
2979                     if(mb_x==0)
2980                         for(y=0; y<b_w*2; y++)
2981                             memset(obmc_edged[y], obmc_edged[y][0] + obmc_edged[y][b_w-1], b_w);
2982                     if(mb_x==b_stride-1)
2983                         for(y=0; y<b_w*2; y++)
2984                             memset(obmc_edged[y]+b_w, obmc_edged[y][b_w] + obmc_edged[y][b_w*2-1], b_w);
2985                     if(mb_y==0){
2986                         for(x=0; x<b_w*2; x++)
2987                             obmc_edged[0][x] += obmc_edged[b_w-1][x];
2988                         for(y=1; y<b_w; y++)
2989                             memcpy(obmc_edged[y], obmc_edged[0], b_w*2);
2990                     }
2991                     if(mb_y==b_height-1){
2992                         for(x=0; x<b_w*2; x++)
2993                             obmc_edged[b_w*2-1][x] += obmc_edged[b_w][x];
2994                         for(y=b_w; y<b_w*2-1; y++)
2995                             memcpy(obmc_edged[y], obmc_edged[b_w*2-1], b_w*2);
2996                     }
2997                 }
2998
2999                 //skip stuff outside the picture
3000                 if(mb_x==0 || mb_y==0 || mb_x==b_width-1 || mb_y==b_height-1)
3001                 {
3002                     uint8_t *src= s->  input_picture.data[0];
3003                     uint8_t *dst= s->current_picture.data[0];
3004                     const int stride= s->current_picture.linesize[0];
3005                     const int block_w= MB_SIZE >> s->block_max_depth;
3006                     const int sx= block_w*mb_x - block_w/2;
3007                     const int sy= block_w*mb_y - block_w/2;
3008                     const int w= s->plane[0].width;
3009                     const int h= s->plane[0].height;
3010                     int y;
3011
3012                     for(y=sy; y<0; y++)
3013                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3014                     for(y=h; y<sy+block_w*2; y++)
3015                         memcpy(dst + sx + y*stride, src + sx + y*stride, block_w*2);
3016                     if(sx<0){
3017                         for(y=sy; y<sy+block_w*2; y++)
3018                             memcpy(dst + sx + y*stride, src + sx + y*stride, -sx);
3019                     }
3020                     if(sx+block_w*2 > w){
3021                         for(y=sy; y<sy+block_w*2; y++)
3022                             memcpy(dst + w + y*stride, src + w + y*stride, sx+block_w*2 - w);
3023                     }
3024                 }
3025
3026                 // intra(black) = neighbors' contribution to the current block
3027                 for(i=0; i<3; i++)
3028                     color[i]= get_dc(s, mb_x, mb_y, i);
3029
3030                 // get previous score (cannot be cached due to OBMC)
3031                 if(pass > 0 && (block->type&BLOCK_INTRA)){
3032                     int color0[3]= {block->color[0], block->color[1], block->color[2]};
3033                     check_block(s, mb_x, mb_y, color0, 1, *obmc_edged, &best_rd);
3034                 }else
3035                     check_block_inter(s, mb_x, mb_y, block->mx, block->my, *obmc_edged, &best_rd);
3036
3037                 ref_b= *block;
3038                 ref_rd= best_rd;
3039                 for(ref=0; ref < s->ref_frames; ref++){
3040                     int16_t (*mvr)[2]= &s->ref_mvs[ref][index];
3041                     if(s->ref_scores[ref][index] > s->ref_scores[ref_b.ref][index]*3/2) //FIXME tune threshold
3042                         continue;
3043                     block->ref= ref;
3044                     best_rd= INT_MAX;
3045
3046                     check_block_inter(s, mb_x, mb_y, mvr[0][0], mvr[0][1], *obmc_edged, &best_rd);
3047                     check_block_inter(s, mb_x, mb_y, 0, 0, *obmc_edged, &best_rd);
3048                     if(tb)
3049                         check_block_inter(s, mb_x, mb_y, mvr[-b_stride][0], mvr[-b_stride][1], *obmc_edged, &best_rd);
3050                     if(lb)
3051                         check_block_inter(s, mb_x, mb_y, mvr[-1][0], mvr[-1][1], *obmc_edged, &best_rd);
3052                     if(rb)
3053                         check_block_inter(s, mb_x, mb_y, mvr[1][0], mvr[1][1], *obmc_edged, &best_rd);
3054                     if(bb)
3055                         check_block_inter(s, mb_x, mb_y, mvr[b_stride][0], mvr[b_stride][1], *obmc_edged, &best_rd);
3056
3057                     /* fullpel ME */
3058                     //FIXME avoid subpel interpol / round to nearest integer
3059                     do{
3060                         dia_change=0;
3061                         for(i=0; i<FFMAX(s->avctx->dia_size, 1); i++){
3062                             for(j=0; j<i; j++){
3063                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3064                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3065                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+4*(i-j), block->my-(4*j), *obmc_edged, &best_rd);
3066                                 dia_change |= check_block_inter(s, mb_x, mb_y, block->mx-4*(i-j), block->my+(4*j), *obmc_edged, &best_rd);
3067                             }
3068                         }
3069                     }while(dia_change);
3070                     /* subpel ME */
3071                     do{
3072                         static const int square[8][2]= {{+1, 0},{-1, 0},{ 0,+1},{ 0,-1},{+1,+1},{-1,-1},{+1,-1},{-1,+1},};
3073                         dia_change=0;
3074                         for(i=0; i<8; i++)
3075                             dia_change |= check_block_inter(s, mb_x, mb_y, block->mx+square[i][0], block->my+square[i][1], *obmc_edged, &best_rd);
3076                     }while(dia_change);
3077                     //FIXME or try the standard 2 pass qpel or similar
3078
3079                     mvr[0][0]= block->mx;
3080                     mvr[0][1]= block->my;
3081                     if(ref_rd > best_rd){
3082                         ref_rd= best_rd;
3083                         ref_b= *block;
3084                     }
3085                 }
3086                 best_rd= ref_rd;
3087                 *block= ref_b;
3088 #if 1
3089                 check_block(s, mb_x, mb_y, color, 1, *obmc_edged, &best_rd);
3090                 //FIXME RD style color selection
3091 #endif
3092                 if(!same_block(block, &backup)){
3093                     if(tb ) tb ->type &= ~BLOCK_OPT;
3094                     if(lb ) lb ->type &= ~BLOCK_OPT;
3095                     if(rb ) rb ->type &= ~BLOCK_OPT;
3096                     if(bb ) bb ->type &= ~BLOCK_OPT;
3097                     if(tlb) tlb->type &= ~BLOCK_OPT;
3098                     if(trb) trb->type &= ~BLOCK_OPT;
3099                     if(blb) blb->type &= ~BLOCK_OPT;
3100                     if(brb) brb->type &= ~BLOCK_OPT;
3101                     change ++;
3102                 }
3103             }
3104         }
3105         av_log(NULL, AV_LOG_ERROR, "pass:%d changed:%d\n", pass, change);
3106         if(!change)
3107             break;
3108     }
3109
3110     if(s->block_max_depth == 1){
3111         int change= 0;
3112         for(mb_y= 0; mb_y<b_height; mb_y+=2){
3113             for(mb_x= 0; mb_x<b_width; mb_x+=2){
3114                 int i;
3115                 int best_rd, init_rd;
3116                 const int index= mb_x + mb_y * b_stride;
3117                 BlockNode *b[4];
3118
3119                 b[0]= &s->block[index];
3120                 b[1]= b[0]+1;
3121                 b[2]= b[0]+b_stride;
3122                 b[3]= b[2]+1;
3123                 if(same_block(b[0], b[1]) &&
3124                    same_block(b[0], b[2]) &&
3125                    same_block(b[0], b[3]))
3126                     continue;
3127
3128                 if(!s->me_cache_generation)
3129                     memset(s->me_cache, 0, sizeof(s->me_cache));
3130                 s->me_cache_generation += 1<<22;
3131
3132                 init_rd= best_rd= get_4block_rd(s, mb_x, mb_y, 0);
3133
3134                 //FIXME more multiref search?
3135                 check_4block_inter(s, mb_x, mb_y,
3136                                    (b[0]->mx + b[1]->mx + b[2]->mx + b[3]->mx + 2) >> 2,
3137                                    (b[0]->my + b[1]->my + b[2]->my + b[3]->my + 2) >> 2, 0, &best_rd);
3138
3139                 for(i=0; i<4; i++)
3140                     if(!(b[i]->type&BLOCK_INTRA))
3141                         check_4block_inter(s, mb_x, mb_y, b[i]->mx, b[i]->my, b[i]->ref, &best_rd);
3142
3143                 if(init_rd != best_rd)
3144                     change++;
3145             }
3146         }
3147         av_log(NULL, AV_LOG_ERROR, "pass:4mv changed:%d\n", change*4);
3148     }
3149 }
3150
3151 static void quantize(SnowContext *s, SubBand *b, IDWTELEM *dst, DWTELEM *src, int stride, int bias){
3152     const int level= b->level;
3153     const int w= b->width;
3154     const int h= b->height;
3155     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3156     const int qmul= qexp[qlog&(QROOT-1)]<<((qlog>>QSHIFT) + ENCODER_EXTRA_BITS);
3157     int x,y, thres1, thres2;
3158 //    START_TIMER
3159
3160     if(s->qlog == LOSSLESS_QLOG){
3161         for(y=0; y<h; y++)
3162             for(x=0; x<w; x++)
3163                 dst[x + y*stride]= src[x + y*stride];
3164         return;
3165     }
3166
3167     bias= bias ? 0 : (3*qmul)>>3;
3168     thres1= ((qmul - bias)>>QEXPSHIFT) - 1;
3169     thres2= 2*thres1;
3170
3171     if(!bias){
3172         for(y=0; y<h; y++){
3173             for(x=0; x<w; x++){
3174                 int i= src[x + y*stride];
3175
3176                 if((unsigned)(i+thres1) > thres2){
3177                     if(i>=0){
3178                         i<<= QEXPSHIFT;
3179                         i/= qmul; //FIXME optimize
3180                         dst[x + y*stride]=  i;
3181                     }else{
3182                         i= -i;
3183                         i<<= QEXPSHIFT;
3184                         i/= qmul; //FIXME optimize
3185                         dst[x + y*stride]= -i;
3186                     }
3187                 }else
3188                     dst[x + y*stride]= 0;
3189             }
3190         }
3191     }else{
3192         for(y=0; y<h; y++){
3193             for(x=0; x<w; x++){
3194                 int i= src[x + y*stride];
3195
3196                 if((unsigned)(i+thres1) > thres2){
3197                     if(i>=0){
3198                         i<<= QEXPSHIFT;
3199                         i= (i + bias) / qmul; //FIXME optimize
3200                         dst[x + y*stride]=  i;
3201                     }else{
3202                         i= -i;
3203                         i<<= QEXPSHIFT;
3204                         i= (i + bias) / qmul; //FIXME optimize
3205                         dst[x + y*stride]= -i;
3206                     }
3207                 }else
3208                     dst[x + y*stride]= 0;
3209             }
3210         }
3211     }
3212     if(level+1 == s->spatial_decomposition_count){
3213 //        STOP_TIMER("quantize")
3214     }
3215 }
3216
3217 static void dequantize_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int start_y, int end_y){
3218     const int w= b->width;
3219     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3220     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3221     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3222     int x,y;
3223     START_TIMER
3224
3225     if(s->qlog == LOSSLESS_QLOG) return;
3226
3227     for(y=start_y; y<end_y; y++){
3228 //        DWTELEM * line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3229         IDWTELEM * line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3230         for(x=0; x<w; x++){
3231             int i= line[x];
3232             if(i<0){
3233                 line[x]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3234             }else if(i>0){
3235                 line[x]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3236             }
3237         }
3238     }
3239     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3240         STOP_TIMER("dquant")
3241     }
3242 }
3243
3244 static void dequantize(SnowContext *s, SubBand *b, IDWTELEM *src, int stride){
3245     const int w= b->width;
3246     const int h= b->height;
3247     const int qlog= av_clip(s->qlog + b->qlog, 0, QROOT*16);
3248     const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3249     const int qadd= (s->qbias*qmul)>>QBIAS_SHIFT;
3250     int x,y;
3251     START_TIMER
3252
3253     if(s->qlog == LOSSLESS_QLOG) return;
3254
3255     for(y=0; y<h; y++){
3256         for(x=0; x<w; x++){
3257             int i= src[x + y*stride];
3258             if(i<0){
3259                 src[x + y*stride]= -((-i*qmul + qadd)>>(QEXPSHIFT)); //FIXME try different bias
3260             }else if(i>0){
3261                 src[x + y*stride]=  (( i*qmul + qadd)>>(QEXPSHIFT));
3262             }
3263         }
3264     }
3265     if(w > 200 /*level+1 == s->spatial_decomposition_count*/){
3266         STOP_TIMER("dquant")
3267     }
3268 }
3269
3270 static void decorrelate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3271     const int w= b->width;
3272     const int h= b->height;
3273     int x,y;
3274
3275     for(y=h-1; y>=0; y--){
3276         for(x=w-1; x>=0; x--){
3277             int i= x + y*stride;
3278
3279             if(x){
3280                 if(use_median){
3281                     if(y && x+1<w) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3282                     else  src[i] -= src[i - 1];
3283                 }else{
3284                     if(y) src[i] -= mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3285                     else  src[i] -= src[i - 1];
3286                 }
3287             }else{
3288                 if(y) src[i] -= src[i - stride];
3289             }
3290         }
3291     }
3292 }
3293
3294 static void correlate_slice_buffered(SnowContext *s, slice_buffer * sb, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median, int start_y, int end_y){
3295     const int w= b->width;
3296     int x,y;
3297
3298 //    START_TIMER
3299
3300     IDWTELEM * line=0; // silence silly "could be used without having been initialized" warning
3301     IDWTELEM * prev;
3302
3303     if (start_y != 0)
3304         line = slice_buffer_get_line(sb, ((start_y - 1) * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3305
3306     for(y=start_y; y<end_y; y++){
3307         prev = line;
3308 //        line = slice_buffer_get_line_from_address(sb, src + (y * stride));
3309         line = slice_buffer_get_line(sb, (y * b->stride_line) + b->buf_y_offset) + b->buf_x_offset;
3310         for(x=0; x<w; x++){
3311             if(x){
3312                 if(use_median){
3313                     if(y && x+1<w) line[x] += mid_pred(line[x - 1], prev[x], prev[x + 1]);
3314                     else  line[x] += line[x - 1];
3315                 }else{
3316                     if(y) line[x] += mid_pred(line[x - 1], prev[x], line[x - 1] + prev[x] - prev[x - 1]);
3317                     else  line[x] += line[x - 1];
3318                 }
3319             }else{
3320                 if(y) line[x] += prev[x];
3321             }
3322         }
3323     }
3324
3325 //    STOP_TIMER("correlate")
3326 }
3327
3328 static void correlate(SnowContext *s, SubBand *b, IDWTELEM *src, int stride, int inverse, int use_median){
3329     const int w= b->width;
3330     const int h= b->height;
3331     int x,y;
3332
3333     for(y=0; y<h; y++){
3334         for(x=0; x<w; x++){
3335             int i= x + y*stride;
3336
3337             if(x){
3338                 if(use_median){
3339                     if(y && x+1<w) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - stride + 1]);
3340                     else  src[i] += src[i - 1];
3341                 }else{
3342                     if(y) src[i] += mid_pred(src[i - 1], src[i - stride], src[i - 1] + src[i - stride] - src[i - 1 - stride]);
3343                     else  src[i] += src[i - 1];
3344                 }
3345             }else{
3346                 if(y) src[i] += src[i - stride];
3347             }
3348         }
3349     }
3350 }
3351
3352 static void encode_header(SnowContext *s){
3353     int plane_index, level, orientation;
3354     uint8_t kstate[32];
3355
3356     memset(kstate, MID_STATE, sizeof(kstate));
3357
3358     put_rac(&s->c, kstate, s->keyframe);
3359     if(s->keyframe || s->always_reset){
3360         reset_contexts(s);
3361         s->last_spatial_decomposition_type=
3362         s->last_qlog=
3363         s->last_qbias=
3364         s->last_mv_scale=
3365         s->last_block_max_depth= 0;
3366     }
3367     if(s->keyframe){
3368         put_symbol(&s->c, s->header_state, s->version, 0);
3369         put_rac(&s->c, s->header_state, s->always_reset);
3370         put_symbol(&s->c, s->header_state, s->temporal_decomposition_type, 0);
3371         put_symbol(&s->c, s->header_state, s->temporal_decomposition_count, 0);
3372         put_symbol(&s->c, s->header_state, s->spatial_decomposition_count, 0);
3373         put_symbol(&s->c, s->header_state, s->colorspace_type, 0);
3374         put_symbol(&s->c, s->header_state, s->chroma_h_shift, 0);
3375         put_symbol(&s->c, s->header_state, s->chroma_v_shift, 0);
3376         put_rac(&s->c, s->header_state, s->spatial_scalability);
3377 //        put_rac(&s->c, s->header_state, s->rate_scalability);
3378         put_symbol(&s->c, s->header_state, s->max_ref_frames-1, 0);
3379
3380         for(plane_index=0; plane_index<2; plane_index++){
3381             for(level=0; level<s->spatial_decomposition_count; level++){
3382                 for(orientation=level ? 1:0; orientation<4; orientation++){
3383                     if(orientation==2) continue;
3384                     put_symbol(&s->c, s->header_state, s->plane[plane_index].band[level][orientation].qlog, 1);
3385                 }
3386             }
3387         }
3388     }
3389     put_symbol(&s->c, s->header_state, s->spatial_decomposition_type - s->last_spatial_decomposition_type, 1);
3390     put_symbol(&s->c, s->header_state, s->qlog            - s->last_qlog    , 1);
3391     put_symbol(&s->c, s->header_state, s->mv_scale        - s->last_mv_scale, 1);
3392     put_symbol(&s->c, s->header_state, s->qbias           - s->last_qbias   , 1);
3393     put_symbol(&s->c, s->header_state, s->block_max_depth - s->last_block_max_depth, 1);
3394
3395     s->last_spatial_decomposition_type= s->spatial_decomposition_type;
3396     s->last_qlog                      = s->qlog;
3397     s->last_qbias                     = s->qbias;
3398     s->last_mv_scale                  = s->mv_scale;
3399     s->last_block_max_depth           = s->block_max_depth;
3400 }
3401
3402 static int decode_header(SnowContext *s){
3403     int plane_index, level, orientation;
3404     uint8_t kstate[32];
3405
3406     memset(kstate, MID_STATE, sizeof(kstate));
3407
3408     s->keyframe= get_rac(&s->c, kstate);
3409     if(s->keyframe || s->always_reset){
3410         reset_contexts(s);
3411         s->spatial_decomposition_type=
3412         s->qlog=
3413         s->qbias=
3414         s->mv_scale=
3415         s->block_max_depth= 0;
3416     }
3417     if(s->keyframe){
3418         s->version= get_symbol(&s->c, s->header_state, 0);
3419         if(s->version>0){
3420             av_log(s->avctx, AV_LOG_ERROR, "version %d not supported", s->version);
3421             return -1;
3422         }
3423         s->always_reset= get_rac(&s->c, s->header_state);
3424         s->temporal_decomposition_type= get_symbol(&s->c, s->header_state, 0);
3425         s->temporal_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3426         s->spatial_decomposition_count= get_symbol(&s->c, s->header_state, 0);
3427         s->colorspace_type= get_symbol(&s->c, s->header_state, 0);
3428         s->chroma_h_shift= get_symbol(&s->c, s->header_state, 0);
3429         s->chroma_v_shift= get_symbol(&s->c, s->header_state, 0);
3430         s->spatial_scalability= get_rac(&s->c, s->header_state);
3431 //        s->rate_scalability= get_rac(&s->c, s->header_state);
3432         s->max_ref_frames= get_symbol(&s->c, s->header_state, 0)+1;
3433
3434         for(plane_index=0; plane_index<3; plane_index++){
3435             for(level=0; level<s->spatial_decomposition_count; level++){
3436                 for(orientation=level ? 1:0; orientation<4; orientation++){
3437                     int q;
3438                     if     (plane_index==2) q= s->plane[1].band[level][orientation].qlog;
3439                     else if(orientation==2) q= s->plane[plane_index].band[level][1].qlog;
3440                     else                    q= get_symbol(&s->c, s->header_state, 1);
3441                     s->plane[plane_index].band[level][orientation].qlog= q;
3442                 }
3443             }
3444         }
3445     }
3446
3447     s->spatial_decomposition_type+= get_symbol(&s->c, s->header_state, 1);
3448     if(s->spatial_decomposition_type > 1){
3449         av_log(s->avctx, AV_LOG_ERROR, "spatial_decomposition_type %d not supported", s->spatial_decomposition_type);
3450         return -1;
3451     }
3452
3453     s->qlog           += get_symbol(&s->c, s->header_state, 1);
3454     s->mv_scale       += get_symbol(&s->c, s->header_state, 1);
3455     s->qbias          += get_symbol(&s->c, s->header_state, 1);
3456     s->block_max_depth+= get_symbol(&s->c, s->header_state, 1);
3457     if(s->block_max_depth > 1 || s->block_max_depth < 0){
3458         av_log(s->avctx, AV_LOG_ERROR, "block_max_depth= %d is too large", s->block_max_depth);
3459         s->block_max_depth= 0;
3460         return -1;
3461     }
3462
3463     return 0;
3464 }
3465
3466 static void init_qexp(void){
3467     int i;
3468     double v=128;
3469
3470     for(i=0; i<QROOT; i++){
3471         qexp[i]= lrintf(v);
3472         v *= pow(2, 1.0 / QROOT);
3473     }
3474 }
3475
3476 static int common_init(AVCodecContext *avctx){
3477     SnowContext *s = avctx->priv_data;
3478     int width, height;
3479     int level, orientation, plane_index, dec;
3480     int i, j;
3481
3482     s->avctx= avctx;
3483
3484     dsputil_init(&s->dsp, avctx);
3485
3486 #define mcf(dx,dy)\
3487     s->dsp.put_qpel_pixels_tab       [0][dy+dx/4]=\
3488     s->dsp.put_no_rnd_qpel_pixels_tab[0][dy+dx/4]=\
3489         s->dsp.put_h264_qpel_pixels_tab[0][dy+dx/4];\
3490     s->dsp.put_qpel_pixels_tab       [1][dy+dx/4]=\
3491     s->dsp.put_no_rnd_qpel_pixels_tab[1][dy+dx/4]=\
3492         s->dsp.put_h264_qpel_pixels_tab[1][dy+dx/4];
3493
3494     mcf( 0, 0)
3495     mcf( 4, 0)
3496     mcf( 8, 0)
3497     mcf(12, 0)
3498     mcf( 0, 4)
3499     mcf( 4, 4)
3500     mcf( 8, 4)
3501     mcf(12, 4)
3502     mcf( 0, 8)
3503     mcf( 4, 8)
3504     mcf( 8, 8)
3505     mcf(12, 8)
3506     mcf( 0,12)
3507     mcf( 4,12)
3508     mcf( 8,12)
3509     mcf(12,12)
3510
3511 #define mcfh(dx,dy)\
3512     s->dsp.put_pixels_tab       [0][dy/4+dx/8]=\
3513     s->dsp.put_no_rnd_pixels_tab[0][dy/4+dx/8]=\
3514         mc_block_hpel ## dx ## dy ## 16;\
3515     s->dsp.put_pixels_tab       [1][dy/4+dx/8]=\
3516     s->dsp.put_no_rnd_pixels_tab[1][dy/4+dx/8]=\
3517         mc_block_hpel ## dx ## dy ## 8;
3518
3519     mcfh(0, 0)
3520     mcfh(8, 0)
3521     mcfh(0, 8)
3522     mcfh(8, 8)
3523
3524     if(!qexp[0])
3525         init_qexp();
3526
3527     dec= s->spatial_decomposition_count= 5;
3528     s->spatial_decomposition_type= avctx->prediction_method; //FIXME add decorrelator type r transform_type
3529
3530     s->chroma_h_shift= 1; //FIXME XXX
3531     s->chroma_v_shift= 1;
3532
3533 //    dec += FFMAX(s->chroma_h_shift, s->chroma_v_shift);
3534
3535     width= s->avctx->width;
3536     height= s->avctx->height;
3537
3538     s->spatial_idwt_buffer= av_mallocz(width*height*sizeof(IDWTELEM));
3539     s->spatial_dwt_buffer= av_mallocz(width*height*sizeof(DWTELEM)); //FIXME this doesnt belong here
3540
3541     s->mv_scale= (s->avctx->flags & CODEC_FLAG_QPEL) ? 2 : 4;
3542     s->block_max_depth= (s->avctx->flags & CODEC_FLAG_4MV) ? 1 : 0;
3543
3544     for(plane_index=0; plane_index<3; plane_index++){
3545         int w= s->avctx->width;
3546         int h= s->avctx->height;
3547
3548         if(plane_index){
3549             w>>= s->chroma_h_shift;
3550             h>>= s->chroma_v_shift;
3551         }
3552         s->plane[plane_index].width = w;
3553         s->plane[plane_index].height= h;
3554 //av_log(NULL, AV_LOG_DEBUG, "%d %d\n", w, h);
3555         for(level=s->spatial_decomposition_count-1; level>=0; level--){
3556             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3557                 SubBand *b= &s->plane[plane_index].band[level][orientation];
3558
3559                 b->buf= s->spatial_dwt_buffer;
3560                 b->level= level;
3561                 b->stride= s->plane[plane_index].width << (s->spatial_decomposition_count - level);
3562                 b->width = (w + !(orientation&1))>>1;
3563                 b->height= (h + !(orientation>1))>>1;
3564
3565                 b->stride_line = 1 << (s->spatial_decomposition_count - level);
3566                 b->buf_x_offset = 0;
3567                 b->buf_y_offset = 0;
3568
3569                 if(orientation&1){
3570                     b->buf += (w+1)>>1;
3571                     b->buf_x_offset = (w+1)>>1;
3572                 }
3573                 if(orientation>1){
3574                     b->buf += b->stride>>1;
3575                     b->buf_y_offset = b->stride_line >> 1;
3576                 }
3577                 b->ibuf= s->spatial_idwt_buffer + (b->buf - s->spatial_dwt_buffer);
3578
3579                 if(level)
3580                     b->parent= &s->plane[plane_index].band[level-1][orientation];
3581                 b->x_coeff=av_mallocz(((b->width+1) * b->height+1)*sizeof(x_and_coeff));
3582             }
3583             w= (w+1)>>1;
3584             h= (h+1)>>1;
3585         }
3586     }
3587
3588     for(i=0; i<MAX_REF_FRAMES; i++)
3589         for(j=0; j<MAX_REF_FRAMES; j++)
3590             scale_mv_ref[i][j] = 256*(i+1)/(j+1);
3591
3592     reset_contexts(s);
3593 /*
3594     width= s->width= avctx->width;
3595     height= s->height= avctx->height;
3596
3597     assert(width && height);
3598 */
3599     s->avctx->get_buffer(s->avctx, &s->mconly_picture);
3600
3601     return 0;
3602 }
3603
3604 static int qscale2qlog(int qscale){
3605     return rint(QROOT*log(qscale / (float)FF_QP2LAMBDA)/log(2))
3606            + 61*QROOT/8; //<64 >60
3607 }
3608
3609 static int ratecontrol_1pass(SnowContext *s, AVFrame *pict)
3610 {
3611     /* estimate the frame's complexity as a sum of weighted dwt coefs.
3612      * FIXME we know exact mv bits at this point,
3613      * but ratecontrol isn't set up to include them. */
3614     uint32_t coef_sum= 0;
3615     int level, orientation, delta_qlog;
3616
3617     for(level=0; level<s->spatial_decomposition_count; level++){
3618         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3619             SubBand *b= &s->plane[0].band[level][orientation];
3620             IDWTELEM *buf= b->ibuf;
3621             const int w= b->width;
3622             const int h= b->height;
3623             const int stride= b->stride;
3624             const int qlog= av_clip(2*QROOT + b->qlog, 0, QROOT*16);
3625             const int qmul= qexp[qlog&(QROOT-1)]<<(qlog>>QSHIFT);
3626             const int qdiv= (1<<16)/qmul;
3627             int x, y;
3628             //FIXME this is ugly
3629             for(y=0; y<h; y++)
3630                 for(x=0; x<w; x++)
3631                     buf[x+y*stride]= b->buf[x+y*stride];
3632             if(orientation==0)
3633                 decorrelate(s, b, buf, stride, 1, 0);
3634             for(y=0; y<h; y++)
3635                 for(x=0; x<w; x++)
3636                     coef_sum+= abs(buf[x+y*stride]) * qdiv >> 16;
3637         }
3638     }
3639
3640     /* ugly, ratecontrol just takes a sqrt again */
3641     coef_sum = (uint64_t)coef_sum * coef_sum >> 16;
3642     assert(coef_sum < INT_MAX);
3643
3644     if(pict->pict_type == I_TYPE){
3645         s->m.current_picture.mb_var_sum= coef_sum;
3646         s->m.current_picture.mc_mb_var_sum= 0;
3647     }else{
3648         s->m.current_picture.mc_mb_var_sum= coef_sum;
3649         s->m.current_picture.mb_var_sum= 0;
3650     }
3651
3652     pict->quality= ff_rate_estimate_qscale(&s->m, 1);
3653     if (pict->quality < 0)
3654         return INT_MIN;
3655     s->lambda= pict->quality * 3/2;
3656     delta_qlog= qscale2qlog(pict->quality) - s->qlog;
3657     s->qlog+= delta_qlog;
3658     return delta_qlog;
3659 }
3660
3661 static void calculate_vissual_weight(SnowContext *s, Plane *p){
3662     int width = p->width;
3663     int height= p->height;
3664     int level, orientation, x, y;
3665
3666     for(level=0; level<s->spatial_decomposition_count; level++){
3667         for(orientation=level ? 1 : 0; orientation<4; orientation++){
3668             SubBand *b= &p->band[level][orientation];
3669             IDWTELEM *ibuf= b->ibuf;
3670             int64_t error=0;
3671
3672             memset(s->spatial_idwt_buffer, 0, sizeof(*s->spatial_idwt_buffer)*width*height);
3673             ibuf[b->width/2 + b->height/2*b->stride]= 256*16;
3674             ff_spatial_idwt(s->spatial_idwt_buffer, width, height, width, s->spatial_decomposition_type, s->spatial_decomposition_count);
3675             for(y=0; y<height; y++){
3676                 for(x=0; x<width; x++){
3677                     int64_t d= s->spatial_idwt_buffer[x + y*width]*16;
3678                     error += d*d;
3679                 }
3680             }
3681
3682             b->qlog= (int)(log(352256.0/sqrt(error)) / log(pow(2.0, 1.0/QROOT))+0.5);
3683 //            av_log(NULL, AV_LOG_DEBUG, "%d %d %d\n", level, orientation, b->qlog/*, sqrt(error)*/);
3684         }
3685     }
3686 }
3687
3688 static int encode_init(AVCodecContext *avctx)
3689 {
3690     SnowContext *s = avctx->priv_data;
3691     int plane_index;
3692
3693     if(avctx->strict_std_compliance > FF_COMPLIANCE_EXPERIMENTAL){
3694         av_log(avctx, AV_LOG_ERROR, "this codec is under development, files encoded with it may not be decodable with future versions!!!\n"
3695                "use vstrict=-2 / -strict -2 to use it anyway\n");
3696         return -1;
3697     }
3698
3699     if(avctx->prediction_method == DWT_97
3700        && (avctx->flags & CODEC_FLAG_QSCALE)
3701        && avctx->global_quality == 0){
3702         av_log(avctx, AV_LOG_ERROR, "the 9/7 wavelet is incompatible with lossless mode\n");
3703         return -1;
3704     }
3705
3706     common_init(avctx);
3707     alloc_blocks(s);
3708
3709     s->version=0;
3710
3711     s->m.avctx   = avctx;
3712     s->m.flags   = avctx->flags;
3713     s->m.bit_rate= avctx->bit_rate;
3714
3715     s->m.me.scratchpad= av_mallocz((avctx->width+64)*2*16*2*sizeof(uint8_t));
3716     s->m.me.map       = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3717     s->m.me.score_map = av_mallocz(ME_MAP_SIZE*sizeof(uint32_t));
3718     s->m.obmc_scratchpad= av_mallocz(MB_SIZE*MB_SIZE*12*sizeof(uint32_t));
3719     h263_encode_init(&s->m); //mv_penalty
3720
3721     s->max_ref_frames = FFMAX(FFMIN(avctx->refs, MAX_REF_FRAMES), 1);
3722
3723     if(avctx->flags&CODEC_FLAG_PASS1){
3724         if(!avctx->stats_out)
3725             avctx->stats_out = av_mallocz(256);
3726     }
3727     if((avctx->flags&CODEC_FLAG_PASS2) || !(avctx->flags&CODEC_FLAG_QSCALE)){
3728         if(ff_rate_control_init(&s->m) < 0)
3729             return -1;
3730     }
3731     s->pass1_rc= !(avctx->flags & (CODEC_FLAG_QSCALE|CODEC_FLAG_PASS2));
3732
3733     for(plane_index=0; plane_index<3; plane_index++){
3734         calculate_vissual_weight(s, &s->plane[plane_index]);
3735     }
3736
3737
3738     avctx->coded_frame= &s->current_picture;
3739     switch(avctx->pix_fmt){
3740 //    case PIX_FMT_YUV444P:
3741 //    case PIX_FMT_YUV422P:
3742     case PIX_FMT_YUV420P:
3743     case PIX_FMT_GRAY8:
3744 //    case PIX_FMT_YUV411P:
3745 //    case PIX_FMT_YUV410P:
3746         s->colorspace_type= 0;
3747         break;
3748 /*    case PIX_FMT_RGB32:
3749         s->colorspace= 1;
3750         break;*/
3751     default:
3752         av_log(avctx, AV_LOG_ERROR, "format not supported\n");
3753         return -1;
3754     }
3755 //    avcodec_get_chroma_sub_sample(avctx->pix_fmt, &s->chroma_h_shift, &s->chroma_v_shift);
3756     s->chroma_h_shift= 1;
3757     s->chroma_v_shift= 1;
3758
3759     ff_set_cmp(&s->dsp, s->dsp.me_cmp, s->avctx->me_cmp);
3760     ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
3761
3762     s->avctx->get_buffer(s->avctx, &s->input_picture);
3763
3764     if(s->avctx->me_method == ME_ITER){
3765         int i;
3766         int size= s->b_width * s->b_height << 2*s->block_max_depth;
3767         for(i=0; i<s->max_ref_frames; i++){
3768             s->ref_mvs[i]= av_mallocz(size*sizeof(int16_t[2]));
3769             s->ref_scores[i]= av_mallocz(size*sizeof(uint32_t));
3770         }
3771     }
3772
3773     return 0;
3774 }
3775
3776 static int frame_start(SnowContext *s){
3777    AVFrame tmp;
3778    int w= s->avctx->width; //FIXME round up to x16 ?
3779    int h= s->avctx->height;
3780
3781     if(s->current_picture.data[0]){
3782         draw_edges(s->current_picture.data[0], s->current_picture.linesize[0], w   , h   , EDGE_WIDTH  );
3783         draw_edges(s->current_picture.data[1], s->current_picture.linesize[1], w>>1, h>>1, EDGE_WIDTH/2);
3784         draw_edges(s->current_picture.data[2], s->current_picture.linesize[2], w>>1, h>>1, EDGE_WIDTH/2);
3785     }
3786
3787     tmp= s->last_picture[s->max_ref_frames-1];
3788     memmove(s->last_picture+1, s->last_picture, (s->max_ref_frames-1)*sizeof(AVFrame));
3789     s->last_picture[0]= s->current_picture;
3790     s->current_picture= tmp;
3791
3792     if(s->keyframe){
3793         s->ref_frames= 0;
3794     }else{
3795         int i;
3796         for(i=0; i<s->max_ref_frames && s->last_picture[i].data[0]; i++)
3797             if(i && s->last_picture[i-1].key_frame)
3798                 break;
3799         s->ref_frames= i;
3800     }
3801
3802     s->current_picture.reference= 1;
3803     if(s->avctx->get_buffer(s->avctx, &s->current_picture) < 0){
3804         av_log(s->avctx, AV_LOG_ERROR, "get_buffer() failed\n");
3805         return -1;
3806     }
3807
3808     s->current_picture.key_frame= s->keyframe;
3809
3810     return 0;
3811 }
3812
3813 static int encode_frame(AVCodecContext *avctx, unsigned char *buf, int buf_size, void *data){
3814     SnowContext *s = avctx->priv_data;
3815     RangeCoder * const c= &s->c;
3816     AVFrame *pict = data;
3817     const int width= s->avctx->width;
3818     const int height= s->avctx->height;
3819     int level, orientation, plane_index, i, y;
3820     uint8_t rc_header_bak[sizeof(s->header_state)];
3821     uint8_t rc_block_bak[sizeof(s->block_state)];
3822
3823     ff_init_range_encoder(c, buf, buf_size);
3824     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
3825
3826     for(i=0; i<3; i++){
3827         int shift= !!i;
3828         for(y=0; y<(height>>shift); y++)
3829             memcpy(&s->input_picture.data[i][y * s->input_picture.linesize[i]],
3830                    &pict->data[i][y * pict->linesize[i]],
3831                    width>>shift);
3832     }
3833     s->new_picture = *pict;
3834
3835     s->m.picture_number= avctx->frame_number;
3836     if(avctx->flags&CODEC_FLAG_PASS2){
3837         s->m.pict_type =
3838         pict->pict_type= s->m.rc_context.entry[avctx->frame_number].new_pict_type;
3839         s->keyframe= pict->pict_type==FF_I_TYPE;
3840         if(!(avctx->flags&CODEC_FLAG_QSCALE)) {
3841             pict->quality= ff_rate_estimate_qscale(&s->m, 0);
3842             if (pict->quality < 0)
3843                 return -1;
3844         }
3845     }else{
3846         s->keyframe= avctx->gop_size==0 || avctx->frame_number % avctx->gop_size == 0;
3847         s->m.pict_type=
3848         pict->pict_type= s->keyframe ? FF_I_TYPE : FF_P_TYPE;
3849     }
3850
3851     if(s->pass1_rc && avctx->frame_number == 0)
3852         pict->quality= 2*FF_QP2LAMBDA;
3853     if(pict->quality){
3854         s->qlog= qscale2qlog(pict->quality);
3855         s->lambda = pict->quality * 3/2;
3856     }
3857     if(s->qlog < 0 || (!pict->quality && (avctx->flags & CODEC_FLAG_QSCALE))){
3858         s->qlog= LOSSLESS_QLOG;
3859         s->lambda = 0;
3860     }//else keep previous frame's qlog until after motion est
3861
3862     frame_start(s);
3863
3864     s->m.current_picture_ptr= &s->m.current_picture;
3865     if(pict->pict_type == P_TYPE){
3866         int block_width = (width +15)>>4;
3867         int block_height= (height+15)>>4;
3868         int stride= s->current_picture.linesize[0];
3869
3870         assert(s->current_picture.data[0]);
3871         assert(s->last_picture[0].data[0]);
3872
3873         s->m.avctx= s->avctx;
3874         s->m.current_picture.data[0]= s->current_picture.data[0];
3875         s->m.   last_picture.data[0]= s->last_picture[0].data[0];
3876         s->m.    new_picture.data[0]= s->  input_picture.data[0];
3877         s->m.   last_picture_ptr= &s->m.   last_picture;
3878         s->m.linesize=
3879         s->m.   last_picture.linesize[0]=
3880         s->m.    new_picture.linesize[0]=
3881         s->m.current_picture.linesize[0]= stride;
3882         s->m.uvlinesize= s->current_picture.linesize[1];
3883         s->m.width = width;
3884         s->m.height= height;
3885         s->m.mb_width = block_width;
3886         s->m.mb_height= block_height;
3887         s->m.mb_stride=   s->m.mb_width+1;
3888         s->m.b8_stride= 2*s->m.mb_width+1;
3889         s->m.f_code=1;
3890         s->m.pict_type= pict->pict_type;
3891         s->m.me_method= s->avctx->me_method;
3892         s->m.me.scene_change_score=0;
3893         s->m.flags= s->avctx->flags;
3894         s->m.quarter_sample= (s->avctx->flags & CODEC_FLAG_QPEL)!=0;
3895         s->m.out_format= FMT_H263;
3896         s->m.unrestricted_mv= 1;
3897
3898         s->m.lambda = s->lambda;
3899         s->m.qscale= (s->m.lambda*139 + FF_LAMBDA_SCALE*64) >> (FF_LAMBDA_SHIFT + 7);
3900         s->lambda2= s->m.lambda2= (s->m.lambda*s->m.lambda + FF_LAMBDA_SCALE/2) >> FF_LAMBDA_SHIFT;
3901
3902         s->m.dsp= s->dsp; //move
3903         ff_init_me(&s->m);
3904         s->dsp= s->m.dsp;
3905     }
3906
3907     if(s->pass1_rc){
3908         memcpy(rc_header_bak, s->header_state, sizeof(s->header_state));
3909         memcpy(rc_block_bak, s->block_state, sizeof(s->block_state));
3910     }
3911
3912 redo_frame:
3913
3914     s->m.pict_type = pict->pict_type;
3915     s->qbias= pict->pict_type == P_TYPE ? 2 : 0;
3916
3917     encode_header(s);
3918     s->m.misc_bits = 8*(s->c.bytestream - s->c.bytestream_start);
3919     encode_blocks(s, 1);
3920     s->m.mv_bits = 8*(s->c.bytestream - s->c.bytestream_start) - s->m.misc_bits;
3921
3922     for(plane_index=0; plane_index<3; plane_index++){
3923         Plane *p= &s->plane[plane_index];
3924         int w= p->width;
3925         int h= p->height;
3926         int x, y;
3927 //        int bits= put_bits_count(&s->c.pb);
3928
3929     if(!(avctx->flags2 & CODEC_FLAG2_MEMC_ONLY)){
3930         //FIXME optimize
3931      if(pict->data[plane_index]) //FIXME gray hack
3932         for(y=0; y<h; y++){
3933             for(x=0; x<w; x++){
3934                 s->spatial_idwt_buffer[y*w + x]= pict->data[plane_index][y*pict->linesize[plane_index] + x]<<FRAC_BITS;
3935             }
3936         }
3937         predict_plane(s, s->spatial_idwt_buffer, plane_index, 0);
3938
3939         if(   plane_index==0
3940            && pict->pict_type == P_TYPE
3941            && !(avctx->flags&CODEC_FLAG_PASS2)
3942            && s->m.me.scene_change_score > s->avctx->scenechange_threshold){
3943             ff_init_range_encoder(c, buf, buf_size);
3944             ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
3945             pict->pict_type= FF_I_TYPE;
3946             s->keyframe=1;
3947             s->current_picture.key_frame=1;
3948             goto redo_frame;
3949         }
3950
3951         if(s->qlog == LOSSLESS_QLOG){
3952             for(y=0; y<h; y++){
3953                 for(x=0; x<w; x++){
3954                     s->spatial_dwt_buffer[y*w + x]= (s->spatial_idwt_buffer[y*w + x] + (1<<(FRAC_BITS-1))-1)>>FRAC_BITS;
3955                 }
3956             }
3957         }else{
3958             for(y=0; y<h; y++){
3959                 for(x=0; x<w; x++){
3960                     s->spatial_dwt_buffer[y*w + x]=s->spatial_idwt_buffer[y*w + x]<<ENCODER_EXTRA_BITS;
3961                 }
3962             }
3963         }
3964
3965         ff_spatial_dwt(s->spatial_dwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
3966
3967         if(s->pass1_rc && plane_index==0){
3968             int delta_qlog = ratecontrol_1pass(s, pict);
3969             if (delta_qlog <= INT_MIN)
3970                 return -1;
3971             if(delta_qlog){
3972                 //reordering qlog in the bitstream would eliminate this reset
3973                 ff_init_range_encoder(c, buf, buf_size);
3974                 memcpy(s->header_state, rc_header_bak, sizeof(s->header_state));
3975                 memcpy(s->block_state, rc_block_bak, sizeof(s->block_state));
3976                 encode_header(s);
3977                 encode_blocks(s, 0);
3978             }
3979         }
3980
3981         for(level=0; level<s->spatial_decomposition_count; level++){
3982             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3983                 SubBand *b= &p->band[level][orientation];
3984
3985                 quantize(s, b, b->ibuf, b->buf, b->stride, s->qbias);
3986                 if(orientation==0)
3987                     decorrelate(s, b, b->ibuf, b->stride, pict->pict_type == P_TYPE, 0);
3988                 encode_subband(s, b, b->ibuf, b->parent ? b->parent->ibuf : NULL, b->stride, orientation);
3989                 assert(b->parent==NULL || b->parent->stride == b->stride*2);
3990                 if(orientation==0)
3991                     correlate(s, b, b->ibuf, b->stride, 1, 0);
3992             }
3993         }
3994 //        av_log(NULL, AV_LOG_DEBUG, "plane:%d bits:%d\n", plane_index, put_bits_count(&s->c.pb) - bits);
3995
3996         for(level=0; level<s->spatial_decomposition_count; level++){
3997             for(orientation=level ? 1 : 0; orientation<4; orientation++){
3998                 SubBand *b= &p->band[level][orientation];
3999
4000                 dequantize(s, b, b->ibuf, b->stride);
4001             }
4002         }
4003
4004         ff_spatial_idwt(s->spatial_idwt_buffer, w, h, w, s->spatial_decomposition_type, s->spatial_decomposition_count);
4005         if(s->qlog == LOSSLESS_QLOG){
4006             for(y=0; y<h; y++){
4007                 for(x=0; x<w; x++){
4008                     s->spatial_idwt_buffer[y*w + x]<<=FRAC_BITS;
4009                 }
4010             }
4011         }
4012 {START_TIMER
4013         predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4014 STOP_TIMER("pred-conv")}
4015       }else{
4016             //ME/MC only
4017             if(pict->pict_type == I_TYPE){
4018                 for(y=0; y<h; y++){
4019                     for(x=0; x<w; x++){
4020                         s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x]=
4021                             pict->data[plane_index][y*pict->linesize[plane_index] + x];
4022                     }
4023                 }
4024             }else{
4025                 memset(s->spatial_idwt_buffer, 0, sizeof(IDWTELEM)*w*h);
4026                 predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4027             }
4028       }
4029         if(s->avctx->flags&CODEC_FLAG_PSNR){
4030             int64_t error= 0;
4031
4032     if(pict->data[plane_index]) //FIXME gray hack
4033             for(y=0; y<h; y++){
4034                 for(x=0; x<w; x++){
4035                     int d= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x] - pict->data[plane_index][y*pict->linesize[plane_index] + x];
4036                     error += d*d;
4037                 }
4038             }
4039             s->avctx->error[plane_index] += error;
4040             s->current_picture.error[plane_index] = error;
4041         }
4042     }
4043
4044     if(s->last_picture[s->max_ref_frames-1].data[0])
4045         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4046
4047     s->current_picture.coded_picture_number = avctx->frame_number;
4048     s->current_picture.pict_type = pict->pict_type;
4049     s->current_picture.quality = pict->quality;
4050     s->m.frame_bits = 8*(s->c.bytestream - s->c.bytestream_start);
4051     s->m.p_tex_bits = s->m.frame_bits - s->m.misc_bits - s->m.mv_bits;
4052     s->m.current_picture.display_picture_number =
4053     s->m.current_picture.coded_picture_number = avctx->frame_number;
4054     s->m.current_picture.quality = pict->quality;
4055     s->m.total_bits += 8*(s->c.bytestream - s->c.bytestream_start);
4056     if(s->pass1_rc)
4057         if (ff_rate_estimate_qscale(&s->m, 0) < 0)
4058             return -1;
4059     if(avctx->flags&CODEC_FLAG_PASS1)
4060         ff_write_pass1_stats(&s->m);
4061     s->m.last_pict_type = s->m.pict_type;
4062     avctx->frame_bits = s->m.frame_bits;
4063     avctx->mv_bits = s->m.mv_bits;
4064     avctx->misc_bits = s->m.misc_bits;
4065     avctx->p_tex_bits = s->m.p_tex_bits;
4066
4067     emms_c();
4068
4069     return ff_rac_terminate(c);
4070 }
4071
4072 static void common_end(SnowContext *s){
4073     int plane_index, level, orientation, i;
4074
4075     av_freep(&s->spatial_dwt_buffer);
4076     av_freep(&s->spatial_idwt_buffer);
4077
4078     av_freep(&s->m.me.scratchpad);
4079     av_freep(&s->m.me.map);
4080     av_freep(&s->m.me.score_map);
4081     av_freep(&s->m.obmc_scratchpad);
4082
4083     av_freep(&s->block);
4084
4085     for(i=0; i<MAX_REF_FRAMES; i++){
4086         av_freep(&s->ref_mvs[i]);
4087         av_freep(&s->ref_scores[i]);
4088         if(s->last_picture[i].data[0])
4089             s->avctx->release_buffer(s->avctx, &s->last_picture[i]);
4090     }
4091
4092     for(plane_index=0; plane_index<3; plane_index++){
4093         for(level=s->spatial_decomposition_count-1; level>=0; level--){
4094             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4095                 SubBand *b= &s->plane[plane_index].band[level][orientation];
4096
4097                 av_freep(&b->x_coeff);
4098             }
4099         }
4100     }
4101 }
4102
4103 static int encode_end(AVCodecContext *avctx)
4104 {
4105     SnowContext *s = avctx->priv_data;
4106
4107     common_end(s);
4108     av_free(avctx->stats_out);
4109
4110     return 0;
4111 }
4112
4113 static int decode_init(AVCodecContext *avctx)
4114 {
4115     SnowContext *s = avctx->priv_data;
4116     int block_size;
4117
4118     avctx->pix_fmt= PIX_FMT_YUV420P;
4119
4120     common_init(avctx);
4121
4122     block_size = MB_SIZE >> s->block_max_depth;
4123     slice_buffer_init(&s->sb, s->plane[0].height, (block_size) + (s->spatial_decomposition_count * (s->spatial_decomposition_count + 3)) + 1, s->plane[0].width, s->spatial_idwt_buffer);
4124
4125     return 0;
4126 }
4127
4128 static int decode_frame(AVCodecContext *avctx, void *data, int *data_size, uint8_t *buf, int buf_size){
4129     SnowContext *s = avctx->priv_data;
4130     RangeCoder * const c= &s->c;
4131     int bytes_read;
4132     AVFrame *picture = data;
4133     int level, orientation, plane_index;
4134
4135     ff_init_range_decoder(c, buf, buf_size);
4136     ff_build_rac_states(c, 0.05*(1LL<<32), 256-8);
4137
4138     s->current_picture.pict_type= FF_I_TYPE; //FIXME I vs. P
4139     decode_header(s);
4140     if(!s->block) alloc_blocks(s);
4141
4142     frame_start(s);
4143     //keyframe flag dupliaction mess FIXME
4144     if(avctx->debug&FF_DEBUG_PICT_INFO)
4145         av_log(avctx, AV_LOG_ERROR, "keyframe:%d qlog:%d\n", s->keyframe, s->qlog);
4146
4147     decode_blocks(s);
4148
4149     for(plane_index=0; plane_index<3; plane_index++){
4150         Plane *p= &s->plane[plane_index];
4151         int w= p->width;
4152         int h= p->height;
4153         int x, y;
4154         int decode_state[MAX_DECOMPOSITIONS][4][1]; /* Stored state info for unpack_coeffs. 1 variable per instance. */
4155
4156 if(s->avctx->debug&2048){
4157         memset(s->spatial_dwt_buffer, 0, sizeof(DWTELEM)*w*h);
4158         predict_plane(s, s->spatial_idwt_buffer, plane_index, 1);
4159
4160         for(y=0; y<h; y++){
4161             for(x=0; x<w; x++){
4162                 int v= s->current_picture.data[plane_index][y*s->current_picture.linesize[plane_index] + x];
4163                 s->mconly_picture.data[plane_index][y*s->mconly_picture.linesize[plane_index] + x]= v;
4164             }
4165         }
4166 }
4167
4168 {   START_TIMER
4169     for(level=0; level<s->spatial_decomposition_count; level++){
4170         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4171             SubBand *b= &p->band[level][orientation];
4172             unpack_coeffs(s, b, b->parent, orientation);
4173         }
4174     }
4175     STOP_TIMER("unpack coeffs");
4176 }
4177
4178 {START_TIMER
4179     const int mb_h= s->b_height << s->block_max_depth;
4180     const int block_size = MB_SIZE >> s->block_max_depth;
4181     const int block_w    = plane_index ? block_size/2 : block_size;
4182     int mb_y;
4183     dwt_compose_t cs[MAX_DECOMPOSITIONS];
4184     int yd=0, yq=0;
4185     int y;
4186     int end_y;
4187
4188     ff_spatial_idwt_buffered_init(cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count);
4189     for(mb_y=0; mb_y<=mb_h; mb_y++){
4190
4191         int slice_starty = block_w*mb_y;
4192         int slice_h = block_w*(mb_y+1);
4193         if (!(s->keyframe || s->avctx->debug&512)){
4194             slice_starty = FFMAX(0, slice_starty - (block_w >> 1));
4195             slice_h -= (block_w >> 1);
4196         }
4197
4198         {
4199         START_TIMER
4200         for(level=0; level<s->spatial_decomposition_count; level++){
4201             for(orientation=level ? 1 : 0; orientation<4; orientation++){
4202                 SubBand *b= &p->band[level][orientation];
4203                 int start_y;
4204                 int end_y;
4205                 int our_mb_start = mb_y;
4206                 int our_mb_end = (mb_y + 1);
4207                 const int extra= 3;
4208                 start_y = (mb_y ? ((block_w * our_mb_start) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra: 0);
4209                 end_y = (((block_w * our_mb_end) >> (s->spatial_decomposition_count - level)) + s->spatial_decomposition_count - level + extra);
4210                 if (!(s->keyframe || s->avctx->debug&512)){
4211                     start_y = FFMAX(0, start_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4212                     end_y = FFMAX(0, end_y - (block_w >> (1+s->spatial_decomposition_count - level)));
4213                 }
4214                 start_y = FFMIN(b->height, start_y);
4215                 end_y = FFMIN(b->height, end_y);
4216
4217                 if (start_y != end_y){
4218                     if (orientation == 0){
4219                         SubBand * correlate_band = &p->band[0][0];
4220                         int correlate_end_y = FFMIN(b->height, end_y + 1);
4221                         int correlate_start_y = FFMIN(b->height, (start_y ? start_y + 1 : 0));
4222                         decode_subband_slice_buffered(s, correlate_band, &s->sb, correlate_start_y, correlate_end_y, decode_state[0][0]);
4223                         correlate_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, 1, 0, correlate_start_y, correlate_end_y);
4224                         dequantize_slice_buffered(s, &s->sb, correlate_band, correlate_band->ibuf, correlate_band->stride, start_y, end_y);
4225                     }
4226                     else
4227                         decode_subband_slice_buffered(s, b, &s->sb, start_y, end_y, decode_state[level][orientation]);
4228                 }
4229             }
4230         }
4231         STOP_TIMER("decode_subband_slice");
4232         }
4233
4234 {   START_TIMER
4235         for(; yd<slice_h; yd+=4){
4236             ff_spatial_idwt_buffered_slice(&s->dsp, cs, &s->sb, w, h, 1, s->spatial_decomposition_type, s->spatial_decomposition_count, yd);
4237         }
4238     STOP_TIMER("idwt slice");}
4239
4240
4241         if(s->qlog == LOSSLESS_QLOG){
4242             for(; yq<slice_h && yq<h; yq++){
4243                 IDWTELEM * line = slice_buffer_get_line(&s->sb, yq);
4244                 for(x=0; x<w; x++){
4245                     line[x] <<= FRAC_BITS;
4246                 }
4247             }
4248         }
4249
4250         predict_slice_buffered(s, &s->sb, s->spatial_idwt_buffer, plane_index, 1, mb_y);
4251
4252         y = FFMIN(p->height, slice_starty);
4253         end_y = FFMIN(p->height, slice_h);
4254         while(y < end_y)
4255             slice_buffer_release(&s->sb, y++);
4256     }
4257
4258     slice_buffer_flush(&s->sb);
4259
4260 STOP_TIMER("idwt + predict_slices")}
4261     }
4262
4263     emms_c();
4264
4265     if(s->last_picture[s->max_ref_frames-1].data[0])
4266         avctx->release_buffer(avctx, &s->last_picture[s->max_ref_frames-1]);
4267
4268 if(!(s->avctx->debug&2048))
4269     *picture= s->current_picture;
4270 else
4271     *picture= s->mconly_picture;
4272
4273     *data_size = sizeof(AVFrame);
4274
4275     bytes_read= c->bytestream - c->bytestream_start;
4276     if(bytes_read ==0) av_log(s->avctx, AV_LOG_ERROR, "error at end of frame\n"); //FIXME
4277
4278     return bytes_read;
4279 }
4280
4281 static int decode_end(AVCodecContext *avctx)
4282 {
4283     SnowContext *s = avctx->priv_data;
4284
4285     slice_buffer_destroy(&s->sb);
4286
4287     common_end(s);
4288
4289     return 0;
4290 }
4291
4292 AVCodec snow_decoder = {
4293     "snow",
4294     CODEC_TYPE_VIDEO,
4295     CODEC_ID_SNOW,
4296     sizeof(SnowContext),
4297     decode_init,
4298     NULL,
4299     decode_end,
4300     decode_frame,
4301     0 /*CODEC_CAP_DR1*/ /*| CODEC_CAP_DRAW_HORIZ_BAND*/,
4302     NULL
4303 };
4304
4305 #ifdef CONFIG_SNOW_ENCODER
4306 AVCodec snow_encoder = {
4307     "snow",
4308     CODEC_TYPE_VIDEO,
4309     CODEC_ID_SNOW,
4310     sizeof(SnowContext),
4311     encode_init,
4312     encode_frame,
4313     encode_end,
4314 };
4315 #endif
4316
4317
4318 #if 0
4319 #undef malloc
4320 #undef free
4321 #undef printf
4322 #undef random
4323
4324 int main(){
4325     int width=256;
4326     int height=256;
4327     int buffer[2][width*height];
4328     SnowContext s;
4329     int i;
4330     s.spatial_decomposition_count=6;
4331     s.spatial_decomposition_type=1;
4332
4333     printf("testing 5/3 DWT\n");
4334     for(i=0; i<width*height; i++)
4335         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4336
4337     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4338     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4339
4340     for(i=0; i<width*height; i++)
4341         if(buffer[0][i]!= buffer[1][i]) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4342
4343     printf("testing 9/7 DWT\n");
4344     s.spatial_decomposition_type=0;
4345     for(i=0; i<width*height; i++)
4346         buffer[0][i]= buffer[1][i]= random()%54321 - 12345;
4347
4348     ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4349     ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4350
4351     for(i=0; i<width*height; i++)
4352         if(FFABS(buffer[0][i] - buffer[1][i])>20) printf("fsck: %d %d %d\n",i, buffer[0][i], buffer[1][i]);
4353
4354 #if 0
4355     printf("testing AC coder\n");
4356     memset(s.header_state, 0, sizeof(s.header_state));
4357     ff_init_range_encoder(&s.c, buffer[0], 256*256);
4358     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4359
4360     for(i=-256; i<256; i++){
4361 START_TIMER
4362         put_symbol(&s.c, s.header_state, i*i*i/3*FFABS(i), 1);
4363 STOP_TIMER("put_symbol")
4364     }
4365     ff_rac_terminate(&s.c);
4366
4367     memset(s.header_state, 0, sizeof(s.header_state));
4368     ff_init_range_decoder(&s.c, buffer[0], 256*256);
4369     ff_init_cabac_states(&s.c, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64);
4370
4371     for(i=-256; i<256; i++){
4372         int j;
4373 START_TIMER
4374         j= get_symbol(&s.c, s.header_state, 1);
4375 STOP_TIMER("get_symbol")
4376         if(j!=i*i*i/3*FFABS(i)) printf("fsck: %d != %d\n", i, j);
4377     }
4378 #endif
4379 {
4380 int level, orientation, x, y;
4381 int64_t errors[8][4];
4382 int64_t g=0;
4383
4384     memset(errors, 0, sizeof(errors));
4385     s.spatial_decomposition_count=3;
4386     s.spatial_decomposition_type=0;
4387     for(level=0; level<s.spatial_decomposition_count; level++){
4388         for(orientation=level ? 1 : 0; orientation<4; orientation++){
4389             int w= width  >> (s.spatial_decomposition_count-level);
4390             int h= height >> (s.spatial_decomposition_count-level);
4391             int stride= width  << (s.spatial_decomposition_count-level);
4392             DWTELEM *buf= buffer[0];
4393             int64_t error=0;
4394
4395             if(orientation&1) buf+=w;
4396             if(orientation>1) buf+=stride>>1;
4397
4398             memset(buffer[0], 0, sizeof(int)*width*height);
4399             buf[w/2 + h/2*stride]= 256*256;
4400             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4401             for(y=0; y<height; y++){
4402                 for(x=0; x<width; x++){
4403                     int64_t d= buffer[0][x + y*width];
4404                     error += d*d;
4405                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9 && level==2) printf("%8"PRId64" ", d);
4406                 }
4407                 if(FFABS(height/2-y)<9 && level==2) printf("\n");
4408             }
4409             error= (int)(sqrt(error)+0.5);
4410             errors[level][orientation]= error;
4411             if(g) g=ff_gcd(g, error);
4412             else g= error;
4413         }
4414     }
4415     printf("static int const visual_weight[][4]={\n");
4416     for(level=0; level<s.spatial_decomposition_count; level++){
4417         printf("  {");
4418         for(orientation=0; orientation<4; orientation++){
4419             printf("%8"PRId64",", errors[level][orientation]/g);
4420         }
4421         printf("},\n");
4422     }
4423     printf("};\n");
4424     {
4425             int level=2;
4426             int orientation=3;
4427             int w= width  >> (s.spatial_decomposition_count-level);
4428             int h= height >> (s.spatial_decomposition_count-level);
4429             int stride= width  << (s.spatial_decomposition_count-level);
4430             DWTELEM *buf= buffer[0];
4431             int64_t error=0;
4432
4433             buf+=w;
4434             buf+=stride>>1;
4435
4436             memset(buffer[0], 0, sizeof(int)*width*height);
4437 #if 1
4438             for(y=0; y<height; y++){
4439                 for(x=0; x<width; x++){
4440                     int tab[4]={0,2,3,1};
4441                     buffer[0][x+width*y]= 256*256*tab[(x&1) + 2*(y&1)];
4442                 }
4443             }
4444             ff_spatial_dwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4445 #else
4446             for(y=0; y<h; y++){
4447                 for(x=0; x<w; x++){
4448                     buf[x + y*stride  ]=169;
4449                     buf[x + y*stride-w]=64;
4450                 }
4451             }
4452             ff_spatial_idwt(buffer[0], width, height, width, s.spatial_decomposition_type, s.spatial_decomposition_count);
4453 #endif
4454             for(y=0; y<height; y++){
4455                 for(x=0; x<width; x++){
4456                     int64_t d= buffer[0][x + y*width];
4457                     error += d*d;
4458                     if(FFABS(width/2-x)<9 && FFABS(height/2-y)<9) printf("%8"PRId64" ", d);
4459                 }
4460                 if(FFABS(height/2-y)<9) printf("\n");
4461             }
4462     }
4463
4464 }
4465     return 0;
4466 }
4467 #endif
4468