drivers/gpu/nvgpu/gk20a/cde_gk20a.c

   1 /*
   2  * Color decompression engine support
   3  *
   4  * Copyright (c) 2014-2015, NVIDIA Corporation.  All rights reserved.
   5  *
   6  * This program is free software; you can redistribute it and/or modify it
   7  * under the terms and conditions of the GNU General Public License,
   8  * version 2, as published by the Free Software Foundation.
   9  *
  10  * This program is distributed in the hope it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  13  * more details.
  14  *
  15  * You should have received a copy of the GNU General Public License
  16  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  17  */
  18
  19 #include <linux/dma-mapping.h>
  20 #include <linux/firmware.h>
  21 #include <linux/fs.h>
  22 #include <linux/debugfs.h>
  23 #include <linux/dma-buf.h>
  24
  25 #include <trace/events/gk20a.h>
  26
  27 #include "gk20a.h"
  28 #include "channel_gk20a.h"
  29 #include "mm_gk20a.h"
  30 #include "cde_gk20a.h"
  31 #include "fence_gk20a.h"
  32 #include "gr_gk20a.h"
  33 #include "debug_gk20a.h"
  34 #include "semaphore_gk20a.h"
  35
  36 #include "hw_ccsr_gk20a.h"
  37 #include "hw_pbdma_gk20a.h"
  38
/* forward declarations for helpers that are referenced before their definitions */
static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx);
static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g);

/*
 * NOTE(review): presumably the delay before an unused temporary context is
 * deleted (unit looks like milliseconds) -- confirm at the use site.
 */
#define CTX_DELETE_TIME 1000

/* once this many contexts are in use, context lookup gives up with -EAGAIN */
#define MAX_CTX_USE_COUNT 42
/*
 * NOTE(review): presumably how long a caller keeps retrying to obtain a
 * context -- confirm at the use site.
 */
#define MAX_CTX_RETRY_TIME 2000
  46
  47 static void gk20a_deinit_cde_img(struct gk20a_cde_ctx *cde_ctx)
  48 {
  49         int i;
  50
  51         for (i = 0; i < cde_ctx->num_bufs; i++) {
  52                 struct mem_desc *mem = cde_ctx->mem + i;
  53                 gk20a_gmmu_unmap_free(cde_ctx->vm, mem);
  54         }
  55
  56         kfree(cde_ctx->init_convert_cmd);
  57
  58         cde_ctx->convert_cmd = NULL;
  59         cde_ctx->init_convert_cmd = NULL;
  60         cde_ctx->num_bufs = 0;
  61         cde_ctx->num_params = 0;
  62         cde_ctx->init_cmd_num_entries = 0;
  63         cde_ctx->convert_cmd_num_entries = 0;
  64         cde_ctx->init_cmd_executed = false;
  65 }
  66
  67 static void gk20a_cde_remove_ctx(struct gk20a_cde_ctx *cde_ctx)
  68 __must_hold(&cde_app->mutex)
  69 {
  70         struct gk20a *g = cde_ctx->g;
  71         struct channel_gk20a *ch = cde_ctx->ch;
  72         struct vm_gk20a *vm = ch->vm;
  73
  74         trace_gk20a_cde_remove_ctx(cde_ctx);
  75
  76         /* release mapped memory */
  77         gk20a_deinit_cde_img(cde_ctx);
  78         gk20a_gmmu_unmap(vm, cde_ctx->backing_store_vaddr,
  79                          g->gr.compbit_store.size, 1);
  80
  81         /* free the channel */
  82         gk20a_channel_close(ch);
  83
  84         /* housekeeping on app */
  85         list_del(&cde_ctx->list);
  86         cde_ctx->g->cde_app.ctx_count--;
  87         kfree(cde_ctx);
  88 }
  89
  90 static void gk20a_cde_cancel_deleter(struct gk20a_cde_ctx *cde_ctx,
  91                 bool wait_finish)
  92 __releases(&cde_app->mutex)
  93 __acquires(&cde_app->mutex)
  94 {
  95         struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
  96
  97         /* permanent contexts do not have deleter works */
  98         if (!cde_ctx->is_temporary)
  99                 return;
 100
 101         if (wait_finish) {
 102                 mutex_unlock(&cde_app->mutex);
 103                 cancel_delayed_work_sync(&cde_ctx->ctx_deleter_work);
 104                 mutex_lock(&cde_app->mutex);
 105         } else {
 106                 cancel_delayed_work(&cde_ctx->ctx_deleter_work);
 107         }
 108 }
 109
 110 static void gk20a_cde_remove_contexts(struct gk20a *g)
 111 __must_hold(&cde_app->mutex)
 112 {
 113         struct gk20a_cde_app *cde_app = &g->cde_app;
 114         struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
 115
 116         /* safe to go off the mutex in cancel_deleter since app is
 117          * deinitialised; no new jobs are started. deleter works may be only at
 118          * waiting for the mutex or before, going to abort */
 119
 120         list_for_each_entry_safe(cde_ctx, cde_ctx_save,
 121                         &cde_app->free_contexts, list) {
 122                 gk20a_cde_cancel_deleter(cde_ctx, true);
 123                 gk20a_cde_remove_ctx(cde_ctx);
 124         }
 125
 126         list_for_each_entry_safe(cde_ctx, cde_ctx_save,
 127                         &cde_app->used_contexts, list) {
 128                 gk20a_cde_cancel_deleter(cde_ctx, true);
 129                 gk20a_cde_remove_ctx(cde_ctx);
 130         }
 131 }
 132
 133 static void gk20a_cde_stop(struct gk20a *g)
 134 __must_hold(&cde_app->mutex)
 135 {
 136         struct gk20a_cde_app *cde_app = &g->cde_app;
 137
 138         /* prevent further conversions and delayed works from working */
 139         cde_app->initialised = false;
 140         /* free all data, empty the list */
 141         gk20a_cde_remove_contexts(g);
 142 }
 143
 144 void gk20a_cde_destroy(struct gk20a *g)
 145 __acquires(&cde_app->mutex)
 146 __releases(&cde_app->mutex)
 147 {
 148         struct gk20a_cde_app *cde_app = &g->cde_app;
 149
 150         if (!cde_app->initialised)
 151                 return;
 152
 153         mutex_lock(&cde_app->mutex);
 154         gk20a_cde_stop(g);
 155         mutex_unlock(&cde_app->mutex);
 156 }
 157
 158 void gk20a_cde_suspend(struct gk20a *g)
 159 __acquires(&cde_app->mutex)
 160 __releases(&cde_app->mutex)
 161 {
 162         struct gk20a_cde_app *cde_app = &g->cde_app;
 163         struct gk20a_cde_ctx *cde_ctx, *cde_ctx_save;
 164
 165         if (!cde_app->initialised)
 166                 return;
 167
 168         mutex_lock(&cde_app->mutex);
 169
 170         list_for_each_entry_safe(cde_ctx, cde_ctx_save,
 171                         &cde_app->free_contexts, list) {
 172                 gk20a_cde_cancel_deleter(cde_ctx, false);
 173         }
 174
 175         list_for_each_entry_safe(cde_ctx, cde_ctx_save,
 176                         &cde_app->used_contexts, list) {
 177                 gk20a_cde_cancel_deleter(cde_ctx, false);
 178         }
 179
 180         mutex_unlock(&cde_app->mutex);
 181
 182 }
 183
 184 static int gk20a_cde_create_context(struct gk20a *g)
 185 __must_hold(&cde_app->mutex)
 186 {
 187         struct gk20a_cde_app *cde_app = &g->cde_app;
 188         struct gk20a_cde_ctx *cde_ctx;
 189
 190         cde_ctx = gk20a_cde_allocate_context(g);
 191         if (IS_ERR(cde_ctx))
 192                 return PTR_ERR(cde_ctx);
 193
 194         list_add(&cde_ctx->list, &cde_app->free_contexts);
 195         cde_app->ctx_count++;
 196         if (cde_app->ctx_count > cde_app->ctx_count_top)
 197                 cde_app->ctx_count_top = cde_app->ctx_count;
 198
 199         return 0;
 200 }
 201
 202 static int gk20a_cde_create_contexts(struct gk20a *g)
 203 __must_hold(&g->cde_app->mutex)
 204 {
 205         int err;
 206         int i;
 207
 208         for (i = 0; i < NUM_CDE_CONTEXTS; i++) {
 209                 err = gk20a_cde_create_context(g);
 210                 if (err)
 211                         goto out;
 212         }
 213
 214         return 0;
 215 out:
 216         gk20a_cde_remove_contexts(g);
 217         return err;
 218 }
 219
 220 static int gk20a_init_cde_buf(struct gk20a_cde_ctx *cde_ctx,
 221                               const struct firmware *img,
 222                               struct gk20a_cde_hdr_buf *buf)
 223 {
 224         struct mem_desc *mem;
 225         int err;
 226
 227         /* check that the file can hold the buf */
 228         if (buf->data_byte_offset != 0 &&
 229             buf->data_byte_offset + buf->num_bytes > img->size) {
 230                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid data section. buffer idx = %d",
 231                            cde_ctx->num_bufs);
 232                 return -EINVAL;
 233         }
 234
 235         /* check that we have enough buf elems available */
 236         if (cde_ctx->num_bufs >= MAX_CDE_BUFS) {
 237                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid data section. buffer idx = %d",
 238                            cde_ctx->num_bufs);
 239                 return -ENOMEM;
 240         }
 241
 242         /* allocate buf */
 243         mem = cde_ctx->mem + cde_ctx->num_bufs;
 244         err = gk20a_gmmu_alloc_map(cde_ctx->vm, buf->num_bytes, mem);
 245         if (err) {
 246                 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate device memory. buffer idx = %d",
 247                            cde_ctx->num_bufs);
 248                 return -ENOMEM;
 249         }
 250
 251         /* copy the content */
 252         if (buf->data_byte_offset != 0)
 253                 memcpy(mem->cpu_va, img->data + buf->data_byte_offset,
 254                        buf->num_bytes);
 255
 256         cde_ctx->num_bufs++;
 257
 258         return 0;
 259 }
 260
 261 static int gk20a_replace_data(struct gk20a_cde_ctx *cde_ctx, void *target,
 262                               int type, s32 shift, u64 mask, u64 value)
 263 {
 264         u32 *target_mem_ptr = target;
 265         u64 *target_mem_ptr_u64 = target;
 266         u64 current_value, new_value;
 267
 268         value = (shift >= 0) ? value << shift : value >> -shift;
 269         value &= mask;
 270
 271         /* read current data from the location */
 272         current_value = 0;
 273         if (type == TYPE_PARAM_TYPE_U32) {
 274                 if (mask != 0xfffffffful)
 275                         current_value = *target_mem_ptr;
 276         } else if (type == TYPE_PARAM_TYPE_U64_LITTLE) {
 277                 if (mask != ~0ul)
 278                         current_value = *target_mem_ptr_u64;
 279         } else if (type == TYPE_PARAM_TYPE_U64_BIG) {
 280                 current_value = *target_mem_ptr_u64;
 281                 current_value = (u64)(current_value >> 32) |
 282                         (u64)(current_value << 32);
 283         } else {
 284                 gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown type. type=%d",
 285                            type);
 286                 return -EINVAL;
 287         }
 288
 289         current_value &= ~mask;
 290         new_value = current_value | value;
 291
 292         /* store the element data back */
 293         if (type == TYPE_PARAM_TYPE_U32)
 294                 *target_mem_ptr = (u32)new_value;
 295         else if (type == TYPE_PARAM_TYPE_U64_LITTLE)
 296                 *target_mem_ptr_u64 = new_value;
 297         else  {
 298                 new_value = (u64)(new_value >> 32) |
 299                         (u64)(new_value << 32);
 300                 *target_mem_ptr_u64 = new_value;
 301         }
 302
 303         return 0;
 304 }
 305
 306 static int gk20a_init_cde_replace(struct gk20a_cde_ctx *cde_ctx,
 307                                   const struct firmware *img,
 308                                   struct gk20a_cde_hdr_replace *replace)
 309 {
 310         struct mem_desc *source_mem;
 311         struct mem_desc *target_mem;
 312         u32 *target_mem_ptr;
 313         u64 vaddr;
 314         int err;
 315
 316         if (replace->target_buf >= cde_ctx->num_bufs ||
 317             replace->source_buf >= cde_ctx->num_bufs) {
 318                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer. target_buf=%u, source_buf=%u, num_bufs=%d",
 319                            replace->target_buf, replace->source_buf,
 320                            cde_ctx->num_bufs);
 321                 return -EINVAL;
 322         }
 323
 324         source_mem = cde_ctx->mem + replace->source_buf;
 325         target_mem = cde_ctx->mem + replace->target_buf;
 326         target_mem_ptr = target_mem->cpu_va;
 327
 328         if (source_mem->size < (replace->source_byte_offset + 3) ||
 329             target_mem->size < (replace->target_byte_offset + 3)) {
 330                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer offsets. target_buf_offs=%lld, source_buf_offs=%lld, source_buf_size=%zu, dest_buf_size=%zu",
 331                            replace->target_byte_offset,
 332                            replace->source_byte_offset,
 333                          source_mem->size,
 334                          target_mem->size);
 335                 return -EINVAL;
 336         }
 337
 338         /* calculate the target pointer */
 339         target_mem_ptr += (replace->target_byte_offset / sizeof(u32));
 340
 341         /* determine patch value */
 342         vaddr = source_mem->gpu_va + replace->source_byte_offset;
 343         err = gk20a_replace_data(cde_ctx, target_mem_ptr, replace->type,
 344                                  replace->shift, replace->mask,
 345                                  vaddr);
 346         if (err) {
 347                 gk20a_warn(&cde_ctx->pdev->dev, "cde: replace failed. err=%d, target_buf=%u, target_buf_offs=%lld, source_buf=%u, source_buf_offs=%lld",
 348                            err, replace->target_buf,
 349                            replace->target_byte_offset,
 350                            replace->source_buf,
 351                            replace->source_byte_offset);
 352         }
 353
 354         return err;
 355 }
 356
 357 static int gk20a_cde_patch_params(struct gk20a_cde_ctx *cde_ctx)
 358 {
 359         struct gk20a *g = cde_ctx->g;
 360         struct mem_desc *target_mem;
 361         u32 *target_mem_ptr;
 362         u64 new_data;
 363         int user_id = 0, i, err;
 364
 365         for (i = 0; i < cde_ctx->num_params; i++) {
 366                 struct gk20a_cde_hdr_param *param = cde_ctx->params + i;
 367                 target_mem = cde_ctx->mem + param->target_buf;
 368                 target_mem_ptr = target_mem->cpu_va;
 369                 target_mem_ptr += (param->target_byte_offset / sizeof(u32));
 370
 371                 switch (param->id) {
 372                 case TYPE_PARAM_COMPTAGS_PER_CACHELINE:
 373                         new_data = g->gr.comptags_per_cacheline;
 374                         break;
 375                 case TYPE_PARAM_GPU_CONFIGURATION:
 376                         new_data = g->ltc_count * g->gr.slices_per_ltc *
 377                                 g->gr.cacheline_size;
 378                         break;
 379                 case TYPE_PARAM_FIRSTPAGEOFFSET:
 380                         new_data = cde_ctx->surf_param_offset;
 381                         break;
 382                 case TYPE_PARAM_NUMPAGES:
 383                         new_data = cde_ctx->surf_param_lines;
 384                         break;
 385                 case TYPE_PARAM_BACKINGSTORE:
 386                         new_data = cde_ctx->backing_store_vaddr;
 387                         break;
 388                 case TYPE_PARAM_DESTINATION:
 389                         new_data = cde_ctx->compbit_vaddr;
 390                         break;
 391                 case TYPE_PARAM_DESTINATION_SIZE:
 392                         new_data = cde_ctx->compbit_size;
 393                         break;
 394                 case TYPE_PARAM_BACKINGSTORE_SIZE:
 395                         new_data = g->gr.compbit_store.size;
 396                         break;
 397                 case TYPE_PARAM_SOURCE_SMMU_ADDR:
 398                         new_data = gk20a_mm_gpuva_to_iova_base(cde_ctx->vm,
 399                                                         cde_ctx->surf_vaddr);
 400                         if (new_data == 0)
 401                                 err = -EINVAL;
 402                         break;
 403                 case TYPE_PARAM_BACKINGSTORE_BASE_HW:
 404                         new_data = g->gr.compbit_store.base_hw;
 405                         break;
 406                 case TYPE_PARAM_GOBS_PER_COMPTAGLINE_PER_SLICE:
 407                         new_data = g->gr.gobs_per_comptagline_per_slice;
 408                         break;
 409                 default:
 410                         user_id = param->id - NUM_RESERVED_PARAMS;
 411                         if (user_id < 0 || user_id >= MAX_CDE_USER_PARAMS)
 412                                 continue;
 413                         new_data = cde_ctx->user_param_values[user_id];
 414                 }
 415
 416                 gk20a_dbg(gpu_dbg_cde, "cde: patch: idx_in_file=%d      param_id=%d     target_buf=%u   target_byte_offset=%lld data_value=0x%llx       data_offset/data_diff=%lld      data_type=%d    data_shift=%d   data_mask=0x%llx",
 417                           i, param->id, param->target_buf,
 418                           param->target_byte_offset, new_data,
 419                           param->data_offset, param->type, param->shift,
 420                           param->mask);
 421
 422                 new_data += param->data_offset;
 423
 424                 err = gk20a_replace_data(cde_ctx, target_mem_ptr, param->type,
 425                                          param->shift, param->mask, new_data);
 426
 427                 if (err) {
 428                         gk20a_warn(&cde_ctx->pdev->dev, "cde: patch failed. err=%d, idx=%d, id=%d, target_buf=%u, target_buf_offs=%lld, patch_value=%llu",
 429                                    err, i, param->id, param->target_buf,
 430                                    param->target_byte_offset, new_data);
 431                         return err;
 432                 }
 433         }
 434
 435         return 0;
 436 }
 437
 438 static int gk20a_init_cde_param(struct gk20a_cde_ctx *cde_ctx,
 439                                 const struct firmware *img,
 440                                 struct gk20a_cde_hdr_param *param)
 441 {
 442         struct mem_desc *target_mem;
 443
 444         if (param->target_buf >= cde_ctx->num_bufs) {
 445                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer parameter. param idx = %d, target_buf=%u, num_bufs=%u",
 446                            cde_ctx->num_params, param->target_buf,
 447                            cde_ctx->num_bufs);
 448                 return -EINVAL;
 449         }
 450
 451         target_mem = cde_ctx->mem + param->target_buf;
 452         if (target_mem->size< (param->target_byte_offset + 3)) {
 453                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid buffer parameter. param idx = %d, target_buf_offs=%lld, target_buf_size=%zu",
 454                            cde_ctx->num_params, param->target_byte_offset,
 455                            target_mem->size);
 456                 return -EINVAL;
 457         }
 458
 459         /* does this parameter fit into our parameter structure */
 460         if (cde_ctx->num_params >= MAX_CDE_PARAMS) {
 461                 gk20a_warn(&cde_ctx->pdev->dev, "cde: no room for new parameters param idx = %d",
 462                            cde_ctx->num_params);
 463                 return -ENOMEM;
 464         }
 465
 466         /* is the given id valid? */
 467         if (param->id >= NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS) {
 468                 gk20a_warn(&cde_ctx->pdev->dev, "cde: parameter id is not valid. param idx = %d, id=%u, max=%u",
 469                            param->id, cde_ctx->num_params,
 470                            NUM_RESERVED_PARAMS + MAX_CDE_USER_PARAMS);
 471                 return -EINVAL;
 472         }
 473
 474         cde_ctx->params[cde_ctx->num_params] = *param;
 475         cde_ctx->num_params++;
 476
 477         return 0;
 478 }
 479
 480 static int gk20a_init_cde_required_class(struct gk20a_cde_ctx *cde_ctx,
 481                                          const struct firmware *img,
 482                                          u32 required_class)
 483 {
 484         struct nvgpu_alloc_obj_ctx_args alloc_obj_ctx;
 485         int err;
 486
 487         alloc_obj_ctx.class_num = required_class;
 488         alloc_obj_ctx.flags = 0;
 489
 490         /* CDE enabled */
 491         cde_ctx->ch->cde = 1;
 492
 493         err = gk20a_alloc_obj_ctx(cde_ctx->ch, &alloc_obj_ctx);
 494         if (err) {
 495                 gk20a_warn(&cde_ctx->pdev->dev, "cde: failed to allocate ctx. err=%d",
 496                            err);
 497                 return err;
 498         }
 499
 500         return 0;
 501 }
 502
 503 static int gk20a_init_cde_command(struct gk20a_cde_ctx *cde_ctx,
 504                                   const struct firmware *img,
 505                                   u32 op,
 506                                   struct gk20a_cde_cmd_elem *cmd_elem,
 507                                   u32 num_elems)
 508 {
 509         struct nvgpu_gpfifo **gpfifo, *gpfifo_elem;
 510         u32 *num_entries;
 511         int i;
 512
 513         /* check command type */
 514         if (op == TYPE_BUF_COMMAND_INIT) {
 515                 gpfifo = &cde_ctx->init_convert_cmd;
 516                 num_entries = &cde_ctx->init_cmd_num_entries;
 517         } else if (op == TYPE_BUF_COMMAND_CONVERT) {
 518                 gpfifo = &cde_ctx->convert_cmd;
 519                 num_entries = &cde_ctx->convert_cmd_num_entries;
 520         } else {
 521                 gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown command. op=%u",
 522                            op);
 523                 return -EINVAL;
 524         }
 525
 526         /* allocate gpfifo entries to be pushed */
 527         *gpfifo = kzalloc(sizeof(struct nvgpu_gpfifo) * num_elems,
 528                           GFP_KERNEL);
 529         if (!*gpfifo) {
 530                 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not allocate memory for gpfifo entries");
 531                 return -ENOMEM;
 532         }
 533
 534         gpfifo_elem = *gpfifo;
 535         for (i = 0; i < num_elems; i++, cmd_elem++, gpfifo_elem++) {
 536                 struct mem_desc *target_mem;
 537
 538                 /* validate the current entry */
 539                 if (cmd_elem->target_buf >= cde_ctx->num_bufs) {
 540                         gk20a_warn(&cde_ctx->pdev->dev, "cde: target buffer is not available (target=%u, num_bufs=%u)",
 541                                    cmd_elem->target_buf, cde_ctx->num_bufs);
 542                         return -EINVAL;
 543                 }
 544
 545                 target_mem = cde_ctx->mem + cmd_elem->target_buf;
 546                 if (target_mem->size<
 547                     cmd_elem->target_byte_offset + cmd_elem->num_bytes) {
 548                         gk20a_warn(&cde_ctx->pdev->dev, "cde: target buffer cannot hold all entries (target_size=%zu, target_byte_offset=%lld, num_bytes=%llu)",
 549                                    target_mem->size,
 550                                    cmd_elem->target_byte_offset,
 551                                    cmd_elem->num_bytes);
 552                         return -EINVAL;
 553                 }
 554
 555                 /* store the element into gpfifo */
 556                 gpfifo_elem->entry0 =
 557                         u64_lo32(target_mem->gpu_va +
 558                         cmd_elem->target_byte_offset);
 559                 gpfifo_elem->entry1 =
 560                         u64_hi32(target_mem->gpu_va +
 561                         cmd_elem->target_byte_offset) |
 562                         pbdma_gp_entry1_length_f(cmd_elem->num_bytes /
 563                                                  sizeof(u32));
 564         }
 565
 566         *num_entries = num_elems;
 567         return 0;
 568 }
 569
 570 static int gk20a_cde_pack_cmdbufs(struct gk20a_cde_ctx *cde_ctx)
 571 {
 572         unsigned long init_bytes = cde_ctx->init_cmd_num_entries *
 573                 sizeof(struct nvgpu_gpfifo);
 574         unsigned long conv_bytes = cde_ctx->convert_cmd_num_entries *
 575                 sizeof(struct nvgpu_gpfifo);
 576         unsigned long total_bytes = init_bytes + conv_bytes;
 577         struct nvgpu_gpfifo *combined_cmd;
 578
 579         /* allocate buffer that has space for both */
 580         combined_cmd = kzalloc(total_bytes, GFP_KERNEL);
 581         if (!combined_cmd) {
 582                 gk20a_warn(&cde_ctx->pdev->dev,
 583                                 "cde: could not allocate memory for gpfifo entries");
 584                 return -ENOMEM;
 585         }
 586
 587         /* move the original init here and append convert */
 588         memcpy(combined_cmd, cde_ctx->init_convert_cmd, init_bytes);
 589         memcpy(combined_cmd + cde_ctx->init_cmd_num_entries,
 590                         cde_ctx->convert_cmd, conv_bytes);
 591
 592         kfree(cde_ctx->init_convert_cmd);
 593         kfree(cde_ctx->convert_cmd);
 594
 595         cde_ctx->init_convert_cmd = combined_cmd;
 596         cde_ctx->convert_cmd = combined_cmd
 597                 + cde_ctx->init_cmd_num_entries;
 598
 599         return 0;
 600 }
 601
 602 static int gk20a_init_cde_img(struct gk20a_cde_ctx *cde_ctx,
 603                               const struct firmware *img)
 604 {
 605         struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
 606         u32 *data = (u32 *)img->data;
 607         u32 num_of_elems;
 608         struct gk20a_cde_hdr_elem *elem;
 609         u32 min_size = 0;
 610         int err = 0;
 611         int i;
 612
 613         min_size += 2 * sizeof(u32);
 614         if (img->size < min_size) {
 615                 gk20a_warn(&cde_ctx->pdev->dev, "cde: invalid image header");
 616                 return -EINVAL;
 617         }
 618
 619         cde_app->firmware_version = data[0];
 620         num_of_elems = data[1];
 621
 622         min_size += num_of_elems * sizeof(*elem);
 623         if (img->size < min_size) {
 624                 gk20a_warn(&cde_ctx->pdev->dev, "cde: bad image");
 625                 return -EINVAL;
 626         }
 627
 628         elem = (struct gk20a_cde_hdr_elem *)&data[2];
 629         for (i = 0; i < num_of_elems; i++) {
 630                 int err = 0;
 631                 switch (elem->type) {
 632                 case TYPE_BUF:
 633                         err = gk20a_init_cde_buf(cde_ctx, img, &elem->buf);
 634                         break;
 635                 case TYPE_REPLACE:
 636                         err = gk20a_init_cde_replace(cde_ctx, img,
 637                                                      &elem->replace);
 638                         break;
 639                 case TYPE_PARAM:
 640                         err = gk20a_init_cde_param(cde_ctx, img, &elem->param);
 641                         break;
 642                 case TYPE_REQUIRED_CLASS:
 643                         err = gk20a_init_cde_required_class(cde_ctx, img,
 644                                 elem->required_class);
 645                         break;
 646                 case TYPE_COMMAND:
 647                 {
 648                         struct gk20a_cde_cmd_elem *cmd = (void *)
 649                                 &img->data[elem->command.data_byte_offset];
 650                         err = gk20a_init_cde_command(cde_ctx, img,
 651                                 elem->command.op, cmd,
 652                                 elem->command.num_entries);
 653                         break;
 654                 }
 655                 case TYPE_ARRAY:
 656                         memcpy(&cde_app->arrays[elem->array.id][0],
 657                                 elem->array.data,
 658                                 MAX_CDE_ARRAY_ENTRIES*sizeof(u32));
 659                         break;
 660                 default:
 661                         gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown header element");
 662                         err = -EINVAL;
 663                 }
 664
 665                 if (err)
 666                         goto deinit_image;
 667
 668                 elem++;
 669         }
 670
 671         if (!cde_ctx->init_convert_cmd || !cde_ctx->init_cmd_num_entries) {
 672                 gk20a_warn(&cde_ctx->pdev->dev, "cde: convert command not defined");
 673                 err = -EINVAL;
 674                 goto deinit_image;
 675         }
 676
 677         if (!cde_ctx->convert_cmd || !cde_ctx->convert_cmd_num_entries) {
 678                 gk20a_warn(&cde_ctx->pdev->dev, "cde: convert command not defined");
 679                 err = -EINVAL;
 680                 goto deinit_image;
 681         }
 682
 683         err = gk20a_cde_pack_cmdbufs(cde_ctx);
 684         if (err)
 685                 goto deinit_image;
 686
 687         return 0;
 688
 689 deinit_image:
 690         gk20a_deinit_cde_img(cde_ctx);
 691         return err;
 692 }
 693
 694 static int gk20a_cde_execute_buffer(struct gk20a_cde_ctx *cde_ctx,
 695                                     u32 op, struct nvgpu_fence *fence,
 696                                     u32 flags, struct gk20a_fence **fence_out)
 697 {
 698         struct nvgpu_gpfifo *gpfifo = NULL;
 699         int num_entries = 0;
 700
 701         /* check command type */
 702         if (op == TYPE_BUF_COMMAND_INIT) {
 703                 /* both init and convert combined */
 704                 gpfifo = cde_ctx->init_convert_cmd;
 705                 num_entries = cde_ctx->init_cmd_num_entries
 706                         + cde_ctx->convert_cmd_num_entries;
 707         } else if (op == TYPE_BUF_COMMAND_CONVERT) {
 708                 gpfifo = cde_ctx->convert_cmd;
 709                 num_entries = cde_ctx->convert_cmd_num_entries;
 710         } else {
 711                 gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown buffer");
 712                 return -EINVAL;
 713         }
 714
 715         if (gpfifo == NULL || num_entries == 0) {
 716                 gk20a_warn(&cde_ctx->pdev->dev, "cde: buffer not available");
 717                 return -ENOSYS;
 718         }
 719
 720         return gk20a_submit_channel_gpfifo(cde_ctx->ch, gpfifo,
 721                                            num_entries, flags, fence, fence_out);
 722 }
 723
 724 static void gk20a_cde_ctx_release(struct gk20a_cde_ctx *cde_ctx)
 725 __acquires(&cde_app->mutex)
 726 __releases(&cde_app->mutex)
 727 {
 728         struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
 729
 730         gk20a_dbg(gpu_dbg_cde_ctx, "releasing use on %p", cde_ctx);
 731         trace_gk20a_cde_release(cde_ctx);
 732
 733         mutex_lock(&cde_app->mutex);
 734
 735         if (cde_ctx->in_use) {
 736                 cde_ctx->in_use = false;
 737                 list_move(&cde_ctx->list, &cde_app->free_contexts);
 738                 cde_app->ctx_usecount--;
 739         } else {
 740                 gk20a_dbg_info("double release cde context %p", cde_ctx);
 741         }
 742
 743         mutex_unlock(&cde_app->mutex);
 744 }
 745
/*
 * Delayed-work handler that removes an idle context.
 *
 * The work item is the ctx_deleter_work member embedded in a
 * struct gk20a_cde_ctx. The context is removed only if it is not in use and
 * the CDE application is still marked initialised; both conditions are
 * checked once without the lock (cheap early-out) and again under
 * cde_app->mutex before the removal.
 */
static void gk20a_cde_ctx_deleter_fn(struct work_struct *work)
__acquires(&cde_app->mutex)
__releases(&cde_app->mutex)
{
        /* recover the owning context from the embedded delayed work */
        struct delayed_work *delay_work = to_delayed_work(work);
        struct gk20a_cde_ctx *cde_ctx = container_of(delay_work,
                        struct gk20a_cde_ctx, ctx_deleter_work);
        struct gk20a_cde_app *cde_app = &cde_ctx->g->cde_app;
        struct platform_device *pdev = cde_ctx->pdev;
        int err;

        /* someone has just taken it? engine deletion started? */
        if (cde_ctx->in_use || !cde_app->initialised)
                return;

        gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
                        "cde: attempting to delete temporary %p", cde_ctx);

        /* paired with the gk20a_idle() at the end of this function */
        err = gk20a_busy(pdev);
        if (err) {
                /* this context would find new use anyway later, so not freeing
                 * here does not leak anything */
                gk20a_warn(&pdev->dev, "cde: cannot set gk20a on, postponing"
                                " temp ctx deletion");
                return;
        }

        mutex_lock(&cde_app->mutex);
        /* re-check under the lock: the state may have changed since the
         * unlocked test above */
        if (cde_ctx->in_use || !cde_app->initialised) {
                gk20a_dbg(gpu_dbg_cde_ctx,
                                "cde: context use raced, not deleting %p",
                                cde_ctx);
                goto out;
        }

        /* this handler is running, so the work should not be queued again */
        WARN(delayed_work_pending(&cde_ctx->ctx_deleter_work),
                        "double pending %p", cde_ctx);

        /* cde_ctx is only used as a pointer value (%p) after this call */
        gk20a_cde_remove_ctx(cde_ctx);
        gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
                        "cde: destroyed %p count=%d use=%d max=%d",
                        cde_ctx, cde_app->ctx_count, cde_app->ctx_usecount,
                        cde_app->ctx_count_top);

out:
        mutex_unlock(&cde_app->mutex);
        gk20a_idle(pdev);
}
 794
 795 static struct gk20a_cde_ctx *gk20a_cde_do_get_context(struct gk20a *g)
 796 __must_hold(&cde_app->mutex)
 797 {
 798         struct gk20a_cde_app *cde_app = &g->cde_app;
 799         struct gk20a_cde_ctx *cde_ctx;
 800
 801         /* exhausted? */
 802
 803         if (cde_app->ctx_usecount >= MAX_CTX_USE_COUNT)
 804                 return ERR_PTR(-EAGAIN);
 805
 806         /* idle context available? */
 807
 808         if (!list_empty(&cde_app->free_contexts)) {
 809                 cde_ctx = list_first_entry(&cde_app->free_contexts,
 810                                 struct gk20a_cde_ctx, list);
 811                 gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
 812                                 "cde: got free %p count=%d use=%d max=%d",
 813                                 cde_ctx, cde_app->ctx_count,
 814                                 cde_app->ctx_usecount,
 815                                 cde_app->ctx_count_top);
 816                 trace_gk20a_cde_get_context(cde_ctx);
 817
 818                 /* deleter work may be scheduled, but in_use prevents it */
 819                 cde_ctx->in_use = true;
 820                 list_move(&cde_ctx->list, &cde_app->used_contexts);
 821                 cde_app->ctx_usecount++;
 822
 823                 /* cancel any deletions now that ctx is in use */
 824                 gk20a_cde_cancel_deleter(cde_ctx, true);
 825                 return cde_ctx;
 826         }
 827
 828         /* no free contexts, get a temporary one */
 829
 830         gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx,
 831                         "cde: no free contexts, count=%d",
 832                         cde_app->ctx_count);
 833
 834         cde_ctx = gk20a_cde_allocate_context(g);
 835         if (IS_ERR(cde_ctx)) {
 836                 gk20a_warn(&g->dev->dev, "cde: cannot allocate context: %ld",
 837                                 PTR_ERR(cde_ctx));
 838                 return cde_ctx;
 839         }
 840
 841         trace_gk20a_cde_get_context(cde_ctx);
 842         cde_ctx->in_use = true;
 843         cde_ctx->is_temporary = true;
 844         cde_app->ctx_usecount++;
 845         cde_app->ctx_count++;
 846         if (cde_app->ctx_count > cde_app->ctx_count_top)
 847                 cde_app->ctx_count_top = cde_app->ctx_count;
 848         list_add(&cde_ctx->list, &cde_app->used_contexts);
 849
 850         return cde_ctx;
 851 }
 852
 853 static struct gk20a_cde_ctx *gk20a_cde_get_context(struct gk20a *g)
 854 __releases(&cde_app->mutex)
 855 __acquires(&cde_app->mutex)
 856 {
 857         struct gk20a_cde_app *cde_app = &g->cde_app;
 858         struct gk20a_cde_ctx *cde_ctx = NULL;
 859         unsigned long end = jiffies + msecs_to_jiffies(MAX_CTX_RETRY_TIME);
 860
 861         do {
 862                 cde_ctx = gk20a_cde_do_get_context(g);
 863                 if (PTR_ERR(cde_ctx) != -EAGAIN)
 864                         break;
 865
 866                 /* exhausted, retry */
 867                 mutex_unlock(&cde_app->mutex);
 868                 cond_resched();
 869                 mutex_lock(&cde_app->mutex);
 870         } while (time_before(jiffies, end));
 871
 872         return cde_ctx;
 873 }
 874
 875 static struct gk20a_cde_ctx *gk20a_cde_allocate_context(struct gk20a *g)
 876 {
 877         struct gk20a_cde_ctx *cde_ctx;
 878         int ret;
 879
 880         cde_ctx = kzalloc(sizeof(*cde_ctx), GFP_KERNEL);
 881         if (!cde_ctx)
 882                 return ERR_PTR(-ENOMEM);
 883
 884         cde_ctx->g = g;
 885         cde_ctx->pdev = g->dev;
 886
 887         ret = gk20a_cde_load(cde_ctx);
 888         if (ret) {
 889                 kfree(cde_ctx);
 890                 return ERR_PTR(ret);
 891         }
 892
 893         INIT_LIST_HEAD(&cde_ctx->list);
 894         cde_ctx->is_temporary = false;
 895         cde_ctx->in_use = false;
 896         INIT_DELAYED_WORK(&cde_ctx->ctx_deleter_work,
 897                         gk20a_cde_ctx_deleter_fn);
 898
 899         gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: allocated %p", cde_ctx);
 900         trace_gk20a_cde_allocate_context(cde_ctx);
 901         return cde_ctx;
 902 }
 903
 904 int gk20a_cde_convert(struct gk20a *g,
 905                       struct dma_buf *compbits_buf,
 906                       s32 compbits_kind, u64 compbits_byte_offset,
 907                       u32 compbits_size, struct nvgpu_fence *fence,
 908                       u32 __flags, struct gk20a_cde_param *params,
 909                       int num_params, struct gk20a_fence **fence_out)
 910 __acquires(&cde_app->mutex)
 911 __releases(&cde_app->mutex)
 912 {
 913         struct gk20a_cde_ctx *cde_ctx = NULL;
 914         struct gk20a_comptags comptags;
 915         u64 compbits_offset = 0;
 916         u64 map_vaddr = 0;
 917         u64 map_offset = 0;
 918         u32 map_size = 0;
 919         u64 big_page_mask = 0;
 920         u32 flags;
 921         int err, i;
 922
 923         mutex_lock(&g->cde_app.mutex);
 924
 925         cde_ctx = gk20a_cde_get_context(g);
 926         if (IS_ERR(cde_ctx)) {
 927                 err = PTR_ERR(cde_ctx);
 928                 goto exit_unlock;
 929         }
 930
 931         /* First, map the buffer to local va */
 932
 933         /* ensure that the compbits buffer has drvdata */
 934         err = gk20a_dmabuf_alloc_drvdata(compbits_buf, &g->dev->dev);
 935         if (err)
 936                 goto exit_unlock;
 937
 938         /* compbits don't start at page aligned offset, so we need to align
 939            the region to be mapped */
 940         big_page_mask = cde_ctx->vm->big_page_size - 1;
 941         map_offset = compbits_byte_offset & ~big_page_mask;
 942
 943         /* compute compbit start offset from the beginning of the mapped
 944            area */
 945         compbits_offset = compbits_byte_offset & big_page_mask;
 946
 947         if (!compbits_size) {
 948                 compbits_size = compbits_buf->size - compbits_byte_offset;
 949                 map_size = compbits_buf->size - map_offset;
 950         }
 951
 952         /* map the destination buffer */
 953         get_dma_buf(compbits_buf); /* a ref for gk20a_vm_map */
 954         map_vaddr = gk20a_vm_map(cde_ctx->vm, compbits_buf, 0,
 955                                  NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
 956                                  compbits_kind, NULL, true,
 957                                  gk20a_mem_flag_none,
 958                                  map_offset, map_size);
 959         if (!map_vaddr) {
 960                 dma_buf_put(compbits_buf);
 961                 err = -EINVAL;
 962                 goto exit_unlock;
 963         }
 964
 965         /* store source buffer compression tags */
 966         gk20a_get_comptags(&g->dev->dev, compbits_buf, &comptags);
 967         cde_ctx->surf_param_offset = comptags.offset;
 968         cde_ctx->surf_param_lines = comptags.lines;
 969
 970         /* store surface vaddr. This is actually compbit vaddr, but since
 971            compbits live in the same surface, and we can get the alloc base
 972            address by using gk20a_mm_gpuva_to_iova_base, this will do */
 973         cde_ctx->surf_vaddr = map_vaddr;
 974
 975         /* store information about destination */
 976         cde_ctx->compbit_vaddr = map_vaddr + compbits_offset;
 977         cde_ctx->compbit_size = compbits_size;
 978
 979         /* remove existing argument data */
 980         memset(cde_ctx->user_param_values, 0,
 981                sizeof(cde_ctx->user_param_values));
 982
 983         /* read user space arguments for the conversion */
 984         for (i = 0; i < num_params; i++) {
 985                 struct gk20a_cde_param *param = params + i;
 986                 int id = param->id - NUM_RESERVED_PARAMS;
 987
 988                 if (id < 0 || id >= MAX_CDE_USER_PARAMS) {
 989                         gk20a_warn(&cde_ctx->pdev->dev, "cde: unknown user parameter");
 990                         err = -EINVAL;
 991                         goto exit_unlock;
 992                 }
 993                 cde_ctx->user_param_values[id] = param->value;
 994         }
 995
 996         /* patch data */
 997         err = gk20a_cde_patch_params(cde_ctx);
 998         if (err) {
 999                 gk20a_warn(&cde_ctx->pdev->dev, "cde: failed to patch parameters");
1000                 goto exit_unlock;
1001         }
1002
1003         gk20a_dbg(gpu_dbg_cde, "cde: buffer=cbc, size=%zu, gpuva=%llx\n",
1004                  g->gr.compbit_store.size, cde_ctx->backing_store_vaddr);
1005         gk20a_dbg(gpu_dbg_cde, "cde: buffer=compbits, size=%llu, gpuva=%llx\n",
1006                  cde_ctx->compbit_size, cde_ctx->compbit_vaddr);
1007
1008
1009         /* take always the postfence as it is needed for protecting the
1010          * cde context */
1011         flags = __flags | NVGPU_SUBMIT_GPFIFO_FLAGS_FENCE_GET;
1012
1013         /* execute the conversion buffer, combined with init first if it's the
1014          * first time */
1015         err = gk20a_cde_execute_buffer(cde_ctx,
1016                         cde_ctx->init_cmd_executed
1017                                 ? TYPE_BUF_COMMAND_CONVERT
1018                                 : TYPE_BUF_COMMAND_INIT,
1019                         fence, flags, fence_out);
1020
1021         cde_ctx->init_cmd_executed = true;
1022
1023 exit_unlock:
1024
1025         /* unmap the buffers - channel holds references to them now */
1026         if (map_vaddr)
1027                 gk20a_vm_unmap(cde_ctx->vm, map_vaddr);
1028
1029         mutex_unlock(&g->cde_app.mutex);
1030         return err;
1031 }
1032
/*
 * Channel callback registered by gk20a_cde_load(); @data is the
 * struct gk20a_cde_ctx that owns @ch.
 *
 * Does nothing while the channel still has jobs on its list. Once the
 * channel is idle:
 *  - if the channel has timed out and the context is not temporary, the
 *    context is flagged temporary and gk20a_cde_create_context() is called
 *    to provide a replacement;
 *  - a temporary context that is in use gets its deleter work scheduled
 *    CTX_DELETE_TIME milliseconds from now;
 *  - the context is released unless the channel has timed out.
 */
static void gk20a_cde_finished_ctx_cb(struct channel_gk20a *ch, void *data)
__acquires(&cde_app->mutex)
__releases(&cde_app->mutex)
{
        struct gk20a_cde_ctx *cde_ctx = data;
        struct gk20a *g = cde_ctx->g;
        struct gk20a_cde_app *cde_app = &g->cde_app;
        bool channel_idle;

        /* sample the job list state under the channel's jobs_lock */
        mutex_lock(&ch->jobs_lock);
        channel_idle = list_empty(&ch->jobs);
        mutex_unlock(&ch->jobs_lock);

        /* more work pending on the channel: the context stays in use */
        if (!channel_idle)
                return;

        trace_gk20a_cde_finished_ctx_cb(cde_ctx);
        gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: finished %p", cde_ctx);
        /* only reported; the rest of the function still runs */
        if (!cde_ctx->in_use)
                gk20a_dbg_info("double finish cde context %p on channel %p",
                                cde_ctx, ch);

        if (ch->has_timedout) {
                if (cde_ctx->is_temporary) {
                        gk20a_warn(&cde_ctx->pdev->dev,
                                        "cde: channel had timed out"
                                        " (temporary channel)");
                        /* going to be deleted anyway */
                } else {
                        gk20a_warn(&cde_ctx->pdev->dev,
                                        "cde: channel had timed out"
                                        ", reloading");
                        /* mark it to be deleted, replace with a new one */
                        mutex_lock(&cde_app->mutex);
                        cde_ctx->is_temporary = true;
                        if (gk20a_cde_create_context(g)) {
                                gk20a_err(&cde_ctx->pdev->dev,
                                                "cde: can't replace context");
                        }
                        mutex_unlock(&cde_app->mutex);
                }
        }

        /* delete temporary contexts later (watch for doubles) */
        if (cde_ctx->is_temporary && cde_ctx->in_use) {
                WARN_ON(delayed_work_pending(&cde_ctx->ctx_deleter_work));
                schedule_delayed_work(&cde_ctx->ctx_deleter_work,
                        msecs_to_jiffies(CTX_DELETE_TIME));
        }

        /* NOTE(review): a timed-out context is not released here and so
         * stays marked in use, while the deleter work skips contexts that
         * are in use - confirm how such contexts finally get removed */
        if (!ch->has_timedout)
                gk20a_cde_ctx_release(cde_ctx);
}
1086
1087 static int gk20a_cde_load(struct gk20a_cde_ctx *cde_ctx)
1088 {
1089         struct gk20a *g = cde_ctx->g;
1090         const struct firmware *img;
1091         struct channel_gk20a *ch;
1092         struct gr_gk20a *gr = &g->gr;
1093         int err = 0;
1094         u64 vaddr;
1095
1096         img = gk20a_request_firmware(g, "gpu2cde.bin");
1097         if (!img) {
1098                 dev_err(&cde_ctx->pdev->dev, "cde: could not fetch the firmware");
1099                 return -ENOSYS;
1100         }
1101
1102         ch = gk20a_open_new_channel_with_cb(g, gk20a_cde_finished_ctx_cb,
1103                         cde_ctx);
1104         if (!ch) {
1105                 gk20a_warn(&cde_ctx->pdev->dev, "cde: gk20a channel not available");
1106                 err = -ENOMEM;
1107                 goto err_get_gk20a_channel;
1108         }
1109
1110         /* bind the channel to the vm */
1111         gk20a_vm_get(&g->mm.pmu.vm);
1112         ch->vm = &g->mm.pmu.vm;
1113         err = channel_gk20a_commit_va(ch);
1114         if (err) {
1115                 gk20a_warn(&cde_ctx->pdev->dev, "cde: could not bind vm");
1116                 goto err_commit_va;
1117         }
1118
1119         /* allocate gpfifo (1024 should be more than enough) */
1120         err = gk20a_alloc_channel_gpfifo(ch,
1121                 &(struct nvgpu_alloc_gpfifo_args){1024, 0});
1122         if (err) {
1123                 gk20a_warn(&cde_ctx->pdev->dev, "cde: unable to allocate gpfifo");
1124                 goto err_alloc_gpfifo;
1125         }
1126
1127         /* map backing store to gpu virtual space */
1128         vaddr = gk20a_gmmu_map(ch->vm, &gr->compbit_store.sgt,
1129                                g->gr.compbit_store.size,
1130                                NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1131                                gk20a_mem_flag_read_only);
1132
1133         if (!vaddr) {
1134                 gk20a_warn(&cde_ctx->pdev->dev, "cde: cannot map compression bit backing store");
1135                 err = -ENOMEM;
1136                 goto err_map_backingstore;
1137         }
1138
1139         /* store initialisation data */
1140         cde_ctx->ch = ch;
1141         cde_ctx->vm = ch->vm;
1142         cde_ctx->backing_store_vaddr = vaddr;
1143
1144         /* initialise the firmware */
1145         err = gk20a_init_cde_img(cde_ctx, img);
1146         if (err) {
1147                 gk20a_warn(&cde_ctx->pdev->dev, "cde: image initialisation failed");
1148                 goto err_init_cde_img;
1149         }
1150
1151         /* initialisation done */
1152         release_firmware(img);
1153
1154         return 0;
1155
1156 err_init_cde_img:
1157         gk20a_gmmu_unmap(ch->vm, vaddr, g->gr.compbit_store.size, 1);
1158 err_map_backingstore:
1159 err_alloc_gpfifo:
1160         gk20a_vm_put(ch->vm);
1161 err_commit_va:
1162 err_get_gk20a_channel:
1163         release_firmware(img);
1164         dev_err(&cde_ctx->pdev->dev, "cde: couldn't initialise buffer converter: %d",
1165                 err);
1166         return err;
1167 }
1168
1169 int gk20a_cde_reload(struct gk20a *g)
1170 __acquires(&cde_app->mutex)
1171 __releases(&cde_app->mutex)
1172 {
1173         struct gk20a_cde_app *cde_app = &g->cde_app;
1174         int err;
1175
1176         if (!cde_app->initialised)
1177                 return -ENOSYS;
1178
1179         err = gk20a_busy(g->dev);
1180         if (err)
1181                 return err;
1182
1183         mutex_lock(&cde_app->mutex);
1184
1185         gk20a_cde_stop(g);
1186
1187         err = gk20a_cde_create_contexts(g);
1188         if (!err)
1189                 cde_app->initialised = true;
1190
1191         mutex_unlock(&cde_app->mutex);
1192
1193         gk20a_idle(g->dev);
1194         return err;
1195 }
1196
1197 int gk20a_init_cde_support(struct gk20a *g)
1198 __acquires(&cde_app->mutex)
1199 __releases(&cde_app->mutex)
1200 {
1201         struct gk20a_cde_app *cde_app = &g->cde_app;
1202         int err;
1203
1204         if (cde_app->initialised)
1205                 return 0;
1206
1207         gk20a_dbg(gpu_dbg_fn | gpu_dbg_cde_ctx, "cde: init");
1208
1209         mutex_init(&cde_app->mutex);
1210         mutex_lock(&cde_app->mutex);
1211
1212         INIT_LIST_HEAD(&cde_app->free_contexts);
1213         INIT_LIST_HEAD(&cde_app->used_contexts);
1214         cde_app->ctx_count = 0;
1215         cde_app->ctx_count_top = 0;
1216         cde_app->ctx_usecount = 0;
1217
1218         err = gk20a_cde_create_contexts(g);
1219         if (!err)
1220                 cde_app->initialised = true;
1221
1222         mutex_unlock(&cde_app->mutex);
1223         gk20a_dbg(gpu_dbg_cde_ctx, "cde: init finished: %d", err);
1224         return err;
1225 }
1226
/*
 * Identifiers of the launch parameters written with WRITE_PATCH() in
 * gk20a_buffer_convert_gpu_to_cde_v1(). The values are fixed; entries
 * marked "v0" are for firmware v0 only.
 */
enum cde_launch_patch_id {
        PATCH_H_QMD_CTA_RASTER_WIDTH_ID = 1024,
        PATCH_H_QMD_CTA_RASTER_HEIGHT_ID = 1025,
        PATCH_QMD_CTA_RASTER_DEPTH_ID = 1026,           /* v0 */
        PATCH_QMD_CTA_THREAD_DIMENSION0_ID = 1027,
        PATCH_QMD_CTA_THREAD_DIMENSION1_ID = 1028,
        PATCH_QMD_CTA_THREAD_DIMENSION2_ID = 1029,      /* v0 */
        PATCH_USER_CONST_XTILES_ID = 1030,              /* v0 */
        PATCH_USER_CONST_YTILES_ID = 1031,              /* v0 */
        PATCH_USER_CONST_BLOCKHEIGHTLOG2_ID = 1032,
        PATCH_USER_CONST_DSTPITCH_ID = 1033,            /* v0 */
        PATCH_H_USER_CONST_FLAGS_ID = 1034,             /* v0 */
        PATCH_H_VPC_CURRENT_GRID_SIZE_X_ID = 1035,
        PATCH_H_VPC_CURRENT_GRID_SIZE_Y_ID = 1036,
        PATCH_H_VPC_CURRENT_GRID_SIZE_Z_ID = 1037,
        PATCH_VPC_CURRENT_GROUP_SIZE_X_ID = 1038,
        PATCH_VPC_CURRENT_GROUP_SIZE_Y_ID = 1039,
        PATCH_VPC_CURRENT_GROUP_SIZE_Z_ID = 1040,
        PATCH_USER_CONST_XBLOCKS_ID = 1041,
        PATCH_H_USER_CONST_DSTOFFSET_ID = 1042,
        PATCH_V_QMD_CTA_RASTER_WIDTH_ID = 1043,
        PATCH_V_QMD_CTA_RASTER_HEIGHT_ID = 1044,
        PATCH_V_USER_CONST_DSTOFFSET_ID = 1045,
        PATCH_V_VPC_CURRENT_GRID_SIZE_X_ID = 1046,
        PATCH_V_VPC_CURRENT_GRID_SIZE_Y_ID = 1047,
        PATCH_V_VPC_CURRENT_GRID_SIZE_Z_ID = 1048,
        PATCH_H_LAUNCH_WORD1_ID = 1049,
        PATCH_H_LAUNCH_WORD2_ID = 1050,
        PATCH_V_LAUNCH_WORD1_ID = 1051,
        PATCH_V_LAUNCH_WORD2_ID = 1052,
        PATCH_H_QMD_PROGRAM_OFFSET_ID = 1053,
        PATCH_H_QMD_REGISTER_COUNT_ID = 1054,
        PATCH_V_QMD_PROGRAM_OFFSET_ID = 1055,
        PATCH_V_QMD_REGISTER_COUNT_ID = 1056,
};
1262
/*
 * Program indices, used to index the ARRAY_PROGRAM_OFFSET and
 * ARRAY_REGISTER_COUNT arrays. Values count up from 0, so NUM_PROGRAMS is
 * the number of programs.
 */
enum programs {
        PROG_HPASS,                     /* 0 */
        PROG_VPASS_LARGE,               /* 1 */
        PROG_VPASS_SMALL,               /* 2 */
        PROG_HPASS_DEBUG,               /* 3 */
        PROG_VPASS_LARGE_DEBUG,         /* 4 */
        PROG_VPASS_SMALL_DEBUG,         /* 5 */
        PROG_PASSTHROUGH,               /* 6 */
        NUM_PROGRAMS                    /* 7 */
};
1273
1274 /* maximum number of WRITE_PATCHes in the below function */
1275 #define MAX_CDE_LAUNCH_PATCHES            32
1276
1277 static int gk20a_buffer_convert_gpu_to_cde_v1(
1278                 struct gk20a *g,
1279                 struct dma_buf *dmabuf, u32 consumer,
1280                 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1281                 u32 width, u32 height, u32 block_height_log2,
1282                 u32 submit_flags, struct nvgpu_fence *fence_in,
1283                 struct gk20a_buffer_state *state)
1284 {
1285         struct gk20a_cde_param params[MAX_CDE_LAUNCH_PATCHES];
1286         int param = 0;
1287         int err = 0;
1288         struct gk20a_fence *new_fence = NULL;
1289         const int wgx = 8;
1290         const int wgy = 8;
1291         const int compbits_per_byte = 4; /* one byte stores 4 compbit pairs */
1292         const int xalign = compbits_per_byte * wgx;
1293         const int yalign = wgy;
1294
1295         /* Compute per launch parameters */
1296         const int xtiles = (width + 7) >> 3;
1297         const int ytiles = (height + 7) >> 3;
1298         const int gridw_h = roundup(xtiles, xalign) / xalign;
1299         const int gridh_h = roundup(ytiles, yalign) / yalign;
1300         const int gridw_v = roundup(ytiles, xalign) / xalign;
1301         const int gridh_v = roundup(xtiles, yalign) / yalign;
1302         const int xblocks = (xtiles + 1) >> 1;
1303         const int voffset = compbits_voffset - compbits_hoffset;
1304
1305         int hprog = PROG_HPASS;
1306         int vprog = (block_height_log2 >= 2) ?
1307                 PROG_VPASS_LARGE : PROG_VPASS_SMALL;
1308         if (g->cde_app.shader_parameter == 1) {
1309                 hprog = PROG_PASSTHROUGH;
1310                 vprog = PROG_PASSTHROUGH;
1311         } else if (g->cde_app.shader_parameter == 2) {
1312                 hprog = PROG_HPASS_DEBUG;
1313                 vprog = (block_height_log2 >= 2) ?
1314                         PROG_VPASS_LARGE_DEBUG :
1315                         PROG_VPASS_SMALL_DEBUG;
1316         }
1317
1318         if (xtiles > 8192 / 8 || ytiles > 8192 / 8)
1319                 gk20a_warn(&g->dev->dev, "cde: surface is exceptionally large (xtiles=%d, ytiles=%d)",
1320                            xtiles, ytiles);
1321
1322         gk20a_dbg(gpu_dbg_cde, "w=%d, h=%d, bh_log2=%d, compbits_hoffset=0x%llx, compbits_voffset=0x%llx",
1323                   width, height, block_height_log2,
1324                   compbits_hoffset, compbits_voffset);
1325         gk20a_dbg(gpu_dbg_cde, "resolution (%d, %d) tiles (%d, %d)",
1326                   width, height, xtiles, ytiles);
1327         gk20a_dbg(gpu_dbg_cde, "group (%d, %d) gridH (%d, %d) gridV (%d, %d)",
1328                   wgx, wgy, gridw_h, gridh_h, gridw_v, gridh_v);
1329         gk20a_dbg(gpu_dbg_cde, "hprog=%d, offset=0x%x, regs=%d, vprog=%d, offset=0x%x, regs=%d",
1330                   hprog,
1331                   g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog],
1332                   g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog],
1333                   vprog,
1334                   g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog],
1335                   g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
1336
1337         /* Write parameters */
1338 #define WRITE_PATCH(NAME, VALUE) \
1339         params[param++] = (struct gk20a_cde_param){NAME##_ID, 0, VALUE}
1340         WRITE_PATCH(PATCH_USER_CONST_XBLOCKS, xblocks);
1341         WRITE_PATCH(PATCH_USER_CONST_BLOCKHEIGHTLOG2,
1342                 block_height_log2);
1343         WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION0, wgx);
1344         WRITE_PATCH(PATCH_QMD_CTA_THREAD_DIMENSION1, wgy);
1345         WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_X, wgx);
1346         WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Y, wgy);
1347         WRITE_PATCH(PATCH_VPC_CURRENT_GROUP_SIZE_Z, 1);
1348
1349         WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_WIDTH, gridw_h);
1350         WRITE_PATCH(PATCH_H_QMD_CTA_RASTER_HEIGHT, gridh_h);
1351         WRITE_PATCH(PATCH_H_USER_CONST_DSTOFFSET, 0);
1352         WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_X, gridw_h);
1353         WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Y, gridh_h);
1354         WRITE_PATCH(PATCH_H_VPC_CURRENT_GRID_SIZE_Z, 1);
1355
1356         WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_WIDTH, gridw_v);
1357         WRITE_PATCH(PATCH_V_QMD_CTA_RASTER_HEIGHT, gridh_v);
1358         WRITE_PATCH(PATCH_V_USER_CONST_DSTOFFSET, voffset);
1359         WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_X, gridw_v);
1360         WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Y, gridh_v);
1361         WRITE_PATCH(PATCH_V_VPC_CURRENT_GRID_SIZE_Z, 1);
1362
1363         WRITE_PATCH(PATCH_H_QMD_PROGRAM_OFFSET,
1364                 g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][hprog]);
1365         WRITE_PATCH(PATCH_H_QMD_REGISTER_COUNT,
1366                 g->cde_app.arrays[ARRAY_REGISTER_COUNT][hprog]);
1367         WRITE_PATCH(PATCH_V_QMD_PROGRAM_OFFSET,
1368                 g->cde_app.arrays[ARRAY_PROGRAM_OFFSET][vprog]);
1369         WRITE_PATCH(PATCH_V_QMD_REGISTER_COUNT,
1370                 g->cde_app.arrays[ARRAY_REGISTER_COUNT][vprog]);
1371
1372         if (consumer & NVGPU_GPU_COMPBITS_CDEH) {
1373                 WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
1374                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
1375                 WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
1376                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
1377         } else {
1378                 WRITE_PATCH(PATCH_H_LAUNCH_WORD1,
1379                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
1380                 WRITE_PATCH(PATCH_H_LAUNCH_WORD2,
1381                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
1382         }
1383
1384         if (consumer & NVGPU_GPU_COMPBITS_CDEV) {
1385                 WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
1386                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][0]);
1387                 WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
1388                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][1]);
1389         } else {
1390                 WRITE_PATCH(PATCH_V_LAUNCH_WORD1,
1391                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][2]);
1392                 WRITE_PATCH(PATCH_V_LAUNCH_WORD2,
1393                         g->cde_app.arrays[ARRAY_LAUNCH_COMMAND][3]);
1394         }
1395 #undef WRITE_PATCH
1396
1397         err = gk20a_cde_convert(g, dmabuf,
1398                                 0, /* dst kind */
1399                                 compbits_hoffset,
1400                                 0, /* dst_size, 0 = auto */
1401                                 fence_in, submit_flags,
1402                                 params, param, &new_fence);
1403         if (err)
1404                 goto out;
1405
1406         /* compbits generated, update state & fence */
1407         gk20a_fence_put(state->fence);
1408         state->fence = new_fence;
1409         state->valid_compbits |= consumer &
1410                 (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
1411 out:
1412         return err;
1413 }
1414
1415 static int gk20a_buffer_convert_gpu_to_cde(
1416                 struct gk20a *g, struct dma_buf *dmabuf, u32 consumer,
1417                 u64 offset, u64 compbits_hoffset, u64 compbits_voffset,
1418                 u32 width, u32 height, u32 block_height_log2,
1419                 u32 submit_flags, struct nvgpu_fence *fence_in,
1420                 struct gk20a_buffer_state *state)
1421 {
1422         int err = 0;
1423
1424         if (!g->cde_app.initialised)
1425                 return -ENOSYS;
1426
1427         err = gk20a_busy(g->dev);
1428         if (err)
1429                 return err;
1430
1431         gk20a_dbg(gpu_dbg_cde, "firmware version = %d\n",
1432                 g->cde_app.firmware_version);
1433
1434         if (g->cde_app.firmware_version == 1) {
1435                 err = gk20a_buffer_convert_gpu_to_cde_v1(
1436                     g, dmabuf, consumer, offset, compbits_hoffset,
1437                     compbits_voffset, width, height, block_height_log2,
1438                     submit_flags, fence_in, state);
1439         } else {
1440                 dev_err(dev_from_gk20a(g), "unsupported CDE firmware version %d",
1441                         g->cde_app.firmware_version);
1442                 err = -EINVAL;
1443         }
1444
1445         gk20a_idle(g->dev);
1446         return err;
1447 }
1448
1449 int gk20a_prepare_compressible_read(
1450                 struct gk20a *g, u32 buffer_fd, u32 request, u64 offset,
1451                 u64 compbits_hoffset, u64 compbits_voffset,
1452                 u32 width, u32 height, u32 block_height_log2,
1453                 u32 submit_flags, struct nvgpu_fence *fence,
1454                 u32 *valid_compbits, u32 *zbc_color,
1455                 struct gk20a_fence **fence_out)
1456 {
1457         int err = 0;
1458         struct gk20a_buffer_state *state;
1459         struct dma_buf *dmabuf;
1460         u32 missing_bits;
1461
1462         dmabuf = dma_buf_get(buffer_fd);
1463         if (IS_ERR(dmabuf))
1464                 return -EINVAL;
1465
1466         err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g),
1467                                      offset, &state);
1468         if (err) {
1469                 dma_buf_put(dmabuf);
1470                 return err;
1471         }
1472
1473         missing_bits = (state->valid_compbits ^ request) & request;
1474
1475         mutex_lock(&state->lock);
1476
1477         if (state->valid_compbits && request == NVGPU_GPU_COMPBITS_NONE) {
1478
1479                 gk20a_fence_put(state->fence);
1480                 state->fence = NULL;
1481                 /* state->fence = decompress();
1482                 state->valid_compbits = 0; */
1483                 err = -EINVAL;
1484                 goto out;
1485         } else if (missing_bits) {
1486                 u32 missing_cde_bits = missing_bits &
1487                          (NVGPU_GPU_COMPBITS_CDEH | NVGPU_GPU_COMPBITS_CDEV);
1488                 if ((state->valid_compbits & NVGPU_GPU_COMPBITS_GPU) &&
1489                     missing_cde_bits) {
1490                         err = gk20a_buffer_convert_gpu_to_cde(
1491                                         g, dmabuf,
1492                                         missing_cde_bits,
1493                                         offset, compbits_hoffset,
1494                                         compbits_voffset,
1495                                         width, height, block_height_log2,
1496                                         submit_flags, fence,
1497                                         state);
1498                         if (err)
1499                                 goto out;
1500                 }
1501         }
1502
1503         if (state->fence && fence_out)
1504                 *fence_out = gk20a_fence_get(state->fence);
1505
1506         if (valid_compbits)
1507                 *valid_compbits = state->valid_compbits;
1508
1509         if (zbc_color)
1510                 *zbc_color = state->zbc_color;
1511
1512 out:
1513         mutex_unlock(&state->lock);
1514         dma_buf_put(dmabuf);
1515         return err;
1516 }
1517
1518 int gk20a_mark_compressible_write(struct gk20a *g, u32 buffer_fd,
1519                                   u32 valid_compbits, u64 offset, u32 zbc_color)
1520 {
1521         int err;
1522         struct gk20a_buffer_state *state;
1523         struct dma_buf *dmabuf;
1524
1525         dmabuf = dma_buf_get(buffer_fd);
1526         if (IS_ERR(dmabuf)) {
1527                 dev_err(dev_from_gk20a(g), "invalid dmabuf");
1528                 return -EINVAL;
1529         }
1530
1531         err = gk20a_dmabuf_get_state(dmabuf, dev_from_gk20a(g), offset, &state);
1532         if (err) {
1533                 dev_err(dev_from_gk20a(g), "could not get state from dmabuf");
1534                 dma_buf_put(dmabuf);
1535                 return err;
1536         }
1537
1538         mutex_lock(&state->lock);
1539
1540         /* Update the compbits state. */
1541         state->valid_compbits = valid_compbits;
1542         state->zbc_color = zbc_color;
1543
1544         /* Discard previous compbit job fence. */
1545         gk20a_fence_put(state->fence);
1546         state->fence = NULL;
1547
1548         mutex_unlock(&state->lock);
1549         dma_buf_put(dmabuf);
1550         return 0;
1551 }
1552
1553 static ssize_t gk20a_cde_reload_write(struct file *file,
1554         const char __user *userbuf, size_t count, loff_t *ppos)
1555 {
1556         struct gk20a *g = file->private_data;
1557         gk20a_cde_reload(g);
1558         return count;
1559 }
1560
1561 static const struct file_operations gk20a_cde_reload_fops = {
1562         .open           = simple_open,
1563         .write          = gk20a_cde_reload_write,
1564 };
1565
1566 void gk20a_cde_debugfs_init(struct platform_device *dev)
1567 {
1568         struct gk20a_platform *platform = platform_get_drvdata(dev);
1569         struct gk20a *g = get_gk20a(dev);
1570
1571         debugfs_create_u32("cde_parameter", S_IWUSR | S_IRUGO,
1572                            platform->debugfs, &g->cde_app.shader_parameter);
1573         debugfs_create_u32("cde_ctx_count", S_IWUSR | S_IRUGO,
1574                            platform->debugfs, &g->cde_app.ctx_count);
1575         debugfs_create_u32("cde_ctx_usecount", S_IWUSR | S_IRUGO,
1576                            platform->debugfs, &g->cde_app.ctx_usecount);
1577         debugfs_create_u32("cde_ctx_count_top", S_IWUSR | S_IRUGO,
1578                            platform->debugfs, &g->cde_app.ctx_count_top);
1579         debugfs_create_file("reload_cde_firmware", S_IWUSR, platform->debugfs,
1580                             g, &gk20a_cde_reload_fops);
1581 }