1 /*
2  * drivers/video/tegra/host/gk20a/mm_gk20a.c
3  *
4  * GK20A memory management
5  *
6  * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
7  *
8  * This program is free software; you can redistribute it and/or modify it
9  * under the terms and conditions of the GNU General Public License,
10  * version 2, as published by the Free Software Foundation.
11  *
12  * This program is distributed in the hope it will be useful, but WITHOUT
13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
15  * more details.
16  *
17  * You should have received a copy of the GNU General Public License along with
18  * this program; if not, write to the Free Software Foundation, Inc.,
19  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
20  */
21
22 #include <linux/delay.h>
23 #include <linux/highmem.h>
24 #include <linux/log2.h>
25 #include <linux/nvhost.h>
26 #include <linux/pm_runtime.h>
27 #include <linux/scatterlist.h>
28 #include <linux/nvmap.h>
29 #include <linux/tegra-soc.h>
30 #include <linux/vmalloc.h>
31 #include <linux/dma-buf.h>
32 #include <asm/cacheflush.h>
33
34 #include "gk20a.h"
35 #include "mm_gk20a.h"
36 #include "hw_gmmu_gk20a.h"
37 #include "hw_fb_gk20a.h"
38 #include "hw_bus_gk20a.h"
39 #include "hw_ram_gk20a.h"
40 #include "hw_mc_gk20a.h"
41 #include "hw_flush_gk20a.h"
42 #include "hw_ltc_gk20a.h"
43
44 #include "kind_gk20a.h"
45
46 #ifdef CONFIG_ARM64
47 #define outer_flush_range(a, b)
48 #define __cpuc_flush_dcache_area __flush_dcache_area
49 #endif
50
51 /*
52  * GPU mapping life cycle
53  * ======================
54  *
55  * Kernel mappings
56  * ---------------
57  *
58  * Kernel mappings are created through vm.map(..., false):
59  *
60  *  - Mappings to the same allocations are reused and refcounted.
61  *  - This path does not support deferred unmapping (i.e. kernel must wait for
62  *    all hw operations on the buffer to complete before unmapping).
63  *  - References to dmabuf are owned and managed by the (kernel) clients of
64  *    the gk20a_vm layer.
65  *
66  *
67  * User space mappings
68  * -------------------
69  *
70  * User space mappings are created through as.map_buffer -> vm.map(..., true):
71  *
72  *  - Mappings to the same allocations are reused and refcounted.
73  *  - This path supports deferred unmapping (i.e. we delay the actual unmapping
74  *    until all hw operations have completed).
75  *  - References to dmabuf are owned and managed by the vm_gk20a
76  *    layer itself. vm.map acquires these refs, and sets
77  *    mapped_buffer->own_mem_ref to record that we must release the refs when we
78  *    actually unmap.
79  *
80  */
81
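/*
 * Editorial usage sketch (not part of the original source; flags, kind and
 * rw_flag below are placeholders): a kernel client of the first flavour maps
 * with user_mapped == false and must serialize against the hw itself:
 *
 *     u64 gpu_va = gk20a_vm_map(vm, dmabuf, 0, flags, kind,
 *                               NULL, false, rw_flag, 0, 0);
 *     ... submit work and wait for it to complete ...
 *     ... then unmap through the vm's (non-deferred) unmap path ...
 *
 * The as.map_buffer path instead goes through vm.map(..., true), which takes
 * its own dma_buf reference (own_mem_ref) and defers the actual unmap until
 * the last hw reference is dropped.
 */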
82 static inline int vm_aspace_id(struct vm_gk20a *vm)
83 {
84         /* -1 is bar1 or pmu, etc. */
85         return vm->as_share ? vm->as_share->id : -1;
86 }
87 static inline u32 hi32(u64 f)
88 {
89         return (u32)(f >> 32);
90 }
91 static inline u32 lo32(u64 f)
92 {
93         return (u32)(f & 0xffffffff);
94 }
95
96 #define FLUSH_CPU_DCACHE(va, pa, size)  \
97         do {    \
98                 __cpuc_flush_dcache_area((void *)(va), (size_t)(size)); \
99                 outer_flush_range(pa, pa + (size_t)(size));             \
100         } while (0)
101
102 static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer);
103 static struct mapped_buffer_node *find_mapped_buffer_locked(
104                                         struct rb_root *root, u64 addr);
105 static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
106                                 struct rb_root *root, struct dma_buf *dmabuf,
107                                 u32 kind);
108 static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
109                                    enum gmmu_pgsz_gk20a pgsz_idx,
110                                    struct sg_table *sgt, u64 buffer_offset,
111                                    u64 first_vaddr, u64 last_vaddr,
112                                    u8 kind_v, u32 ctag_offset, bool cacheable,
113                                    int rw_flag);
114 static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i);
115 static void gk20a_vm_remove_support(struct vm_gk20a *vm);
116
117
118 /* note: keep the page sizes sorted lowest to highest here */
119 static const u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, SZ_128K };
120 static const u32 gmmu_page_shifts[gmmu_nr_page_sizes] = { 12, 17 };
121 static const u64 gmmu_page_offset_masks[gmmu_nr_page_sizes] = { 0xfffLL,
122                                                                 0x1ffffLL };
123 static const u64 gmmu_page_masks[gmmu_nr_page_sizes] = { ~0xfffLL, ~0x1ffffLL };
124
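/*
 * Editorial note: the four tables above are different encodings of the same
 * two page sizes.  For the big page, SZ_128K = 1 << 17, so the shift is 17,
 * the offset mask is (1 << 17) - 1 = 0x1ffff and the page mask is its
 * complement ~0x1ffffLL; the 4K entries follow the same pattern with shift
 * 12 and mask 0xfff.
 */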
125 struct gk20a_comptags {
126         u32 offset;
127         u32 lines;
128 };
129
130 struct gk20a_dmabuf_priv {
131         struct mutex lock;
132
133         struct gk20a_allocator *comptag_allocator;
134         struct gk20a_comptags comptags;
135
136         struct dma_buf_attachment *attach;
137         struct sg_table *sgt;
138
139         int pin_count;
140 };
141
142 static void gk20a_mm_delete_priv(void *_priv)
143 {
144         struct gk20a_dmabuf_priv *priv = _priv;
145         if (!priv)
146                 return;
147
148         if (priv->comptags.lines) {
149                 BUG_ON(!priv->comptag_allocator);
150                 priv->comptag_allocator->free(priv->comptag_allocator,
151                                               priv->comptags.offset,
152                                               priv->comptags.lines);
153         }
154
155         kfree(priv);
156 }
157
158 struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
159 {
160         struct gk20a_dmabuf_priv *priv;
161
162         priv = dma_buf_get_drvdata(dmabuf, dev);
163         if (WARN_ON(!priv))
164                 return ERR_PTR(-EINVAL);
165
166         mutex_lock(&priv->lock);
167
168         if (priv->pin_count == 0) {
169                 priv->attach = dma_buf_attach(dmabuf, dev);
170                 if (IS_ERR(priv->attach)) {
171                         mutex_unlock(&priv->lock);
172                         return (struct sg_table *)priv->attach;
173                 }
174
175                 priv->sgt = dma_buf_map_attachment(priv->attach,
176                                                    DMA_BIDIRECTIONAL);
177                 if (IS_ERR(priv->sgt)) {
178                         dma_buf_detach(dmabuf, priv->attach);
179                         mutex_unlock(&priv->lock);
180                         return priv->sgt;
181                 }
182         }
183
184         priv->pin_count++;
185         mutex_unlock(&priv->lock);
186         return priv->sgt;
187 }
188
189 void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
190                     struct sg_table *sgt)
191 {
192         struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
193         dma_addr_t dma_addr;
194
195         if (IS_ERR(priv) || !priv)
196                 return;
197
198         mutex_lock(&priv->lock);
199         WARN_ON(priv->sgt != sgt);
200         priv->pin_count--;
201         WARN_ON(priv->pin_count < 0);
202         dma_addr = sg_dma_address(priv->sgt->sgl);
203         if (priv->pin_count == 0) {
204                 dma_buf_unmap_attachment(priv->attach, priv->sgt,
205                                          DMA_BIDIRECTIONAL);
206                 dma_buf_detach(dmabuf, priv->attach);
207         }
208         mutex_unlock(&priv->lock);
209 }
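/*
 * Editorial usage sketch (not from the original source): pins are refcounted
 * per dma_buf, so the sg_table returned by the first gk20a_mm_pin() is
 * reused by later pins and must be handed back to gk20a_mm_unpin():
 *
 *     struct sg_table *sgt = gk20a_mm_pin(dev, dmabuf);
 *     if (IS_ERR(sgt))
 *             return PTR_ERR(sgt);
 *     ... program the gmmu from sgt ...
 *     gk20a_mm_unpin(dev, dmabuf, sgt);
 */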
210
211
212 static void gk20a_get_comptags(struct device *dev,
213                                struct dma_buf *dmabuf,
214                                struct gk20a_comptags *comptags)
215 {
216         struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
217
218         if (!comptags)
219                 return;
220
221         if (!priv) {
222                 comptags->lines = 0;
223                 comptags->offset = 0;
224                 return;
225         }
226
227         *comptags = priv->comptags;
228 }
229
230 static int gk20a_alloc_comptags(struct device *dev,
231                                 struct dma_buf *dmabuf,
232                                 struct gk20a_allocator *allocator,
233                                 int lines)
234 {
235         struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
236         u32 offset = 0;
237         int err;
238
239         if (!priv)
240                 return -ENOSYS;
241
242         if (!lines)
243                 return -EINVAL;
244
245         /* store the allocator so we can use it when we free the ctags */
246         priv->comptag_allocator = allocator;
247         err = allocator->alloc(allocator, &offset, lines);
248         if (!err) {
249                 priv->comptags.lines = lines;
250                 priv->comptags.offset = offset;
251         }
252         return err;
253 }
254
255
256
257
258 static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
259 {
260         gk20a_dbg_fn("");
261         if (g->ops.fb.reset)
262                 g->ops.fb.reset(g);
263
264         if (g->ops.fb.init_fs_state)
265                 g->ops.fb.init_fs_state(g);
266
267         return 0;
268 }
269
270 void gk20a_remove_mm_support(struct mm_gk20a *mm)
271 {
272         struct gk20a *g = mm->g;
273         struct device *d = dev_from_gk20a(g);
274         struct vm_gk20a *vm = &mm->bar1.vm;
275         struct inst_desc *inst_block = &mm->bar1.inst_block;
276
277         gk20a_dbg_fn("");
278
279         if (inst_block->cpuva)
280                 dma_free_coherent(d, inst_block->size,
281                         inst_block->cpuva, inst_block->iova);
282         inst_block->cpuva = NULL;
283         inst_block->iova = 0;
284
285         gk20a_vm_remove_support(vm);
286 }
287
288 int gk20a_init_mm_setup_sw(struct gk20a *g)
289 {
290         struct mm_gk20a *mm = &g->mm;
291         int i;
292
293         gk20a_dbg_fn("");
294
295         if (mm->sw_ready) {
296                 gk20a_dbg_fn("skip init");
297                 return 0;
298         }
299
300         mm->g = g;
301         mutex_init(&mm->l2_op_lock);
302         mm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
303         mm->compression_page_size = gmmu_page_sizes[gmmu_page_size_big];
304         mm->pde_stride    = mm->big_page_size << 10;
305         mm->pde_stride_shift = ilog2(mm->pde_stride);
306         BUG_ON(mm->pde_stride_shift > 31); /* we have assumptions about this */
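        /*
         * Editorial worked example: with the 128KB big page size,
         * pde_stride = SZ_128K << 10 = 128MB of VA covered per PDE and
         * pde_stride_shift = ilog2(128MB) = 27, comfortably below the
         * 31-bit limit asserted above.
         */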
307
308         for (i = 0; i < ARRAY_SIZE(gmmu_page_sizes); i++) {
309
310                 u32 num_ptes, pte_space, num_pages;
311
312                 /* assuming "full" page tables */
313                 num_ptes = mm->pde_stride / gmmu_page_sizes[i];
314
315                 pte_space = num_ptes * gmmu_pte__size_v();
316                 /* allocate whole pages */
317                 pte_space = roundup(pte_space, PAGE_SIZE);
318
319                 num_pages = pte_space / PAGE_SIZE;
320                 /* make sure "order" is viable */
321                 BUG_ON(!is_power_of_2(num_pages));
322
323                 mm->page_table_sizing[i].num_ptes = num_ptes;
324                 mm->page_table_sizing[i].order = ilog2(num_pages);
325         }
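        /*
         * Editorial worked example, assuming the 8-byte PTE size noted later
         * in this file: for 4KB pages, num_ptes = 128MB / 4KB = 32768 and
         * pte_space = 32768 * 8B = 256KB = 64 pages, i.e. order 6; for 128KB
         * pages, num_ptes = 1024 and pte_space rounds up to 8KB = 2 pages,
         * i.e. order 1.
         */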
326
327         /*TBD: make channel vm size configurable */
328         mm->channel.size = 1ULL << NV_GMMU_VA_RANGE;
329
330         gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20));
331
332         gk20a_dbg_info("small page-size (%dKB) pte array: %dKB",
333                         gmmu_page_sizes[gmmu_page_size_small] >> 10,
334                         (mm->page_table_sizing[gmmu_page_size_small].num_ptes *
335                          gmmu_pte__size_v()) >> 10);
336
337         gk20a_dbg_info("big page-size (%dKB) pte array: %dKB",
338                         gmmu_page_sizes[gmmu_page_size_big] >> 10,
339                         (mm->page_table_sizing[gmmu_page_size_big].num_ptes *
340                          gmmu_pte__size_v()) >> 10);
341
342
343         gk20a_init_bar1_vm(mm);
344
345         mm->remove_support = gk20a_remove_mm_support;
346         mm->sw_ready = true;
347
348         gk20a_dbg_fn("done");
349         return 0;
350 }
351
352 /* make sure gk20a_init_mm_setup_sw has been called before this */
353 static int gk20a_init_mm_setup_hw(struct gk20a *g)
354 {
355         struct mm_gk20a *mm = &g->mm;
356         struct inst_desc *inst_block = &mm->bar1.inst_block;
357         phys_addr_t inst_pa = inst_block->cpu_pa;
358
359         gk20a_dbg_fn("");
360
361         /* set large page size in fb
362          * note this is very early on, can we defer it? */
363         {
364                 u32 fb_mmu_ctrl = gk20a_readl(g, fb_mmu_ctrl_r());
365
366                 if (gmmu_page_sizes[gmmu_page_size_big] == SZ_128K)
367                         fb_mmu_ctrl = (fb_mmu_ctrl &
368                                        ~fb_mmu_ctrl_vm_pg_size_f(~0x0)) |
369                                 fb_mmu_ctrl_vm_pg_size_128kb_f();
370                 else
371                         BUG_ON(1); /* no support/testing for larger ones yet */
372
373                 gk20a_writel(g, fb_mmu_ctrl_r(), fb_mmu_ctrl);
374         }
375
376         inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
377         gk20a_dbg_info("bar1 inst block ptr: 0x%08x",  (u32)inst_pa);
378
379         gk20a_writel(g, bus_bar1_block_r(),
380                      bus_bar1_block_target_vid_mem_f() |
381                      bus_bar1_block_mode_virtual_f() |
382                      bus_bar1_block_ptr_f(inst_pa));
383         if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g))
384                 return -EBUSY;
385
386         gk20a_dbg_fn("done");
387         return 0;
388 }
389
390 int gk20a_init_mm_support(struct gk20a *g)
391 {
392         u32 err;
393
394         err = gk20a_init_mm_reset_enable_hw(g);
395         if (err)
396                 return err;
397
398         err = gk20a_init_mm_setup_sw(g);
399         if (err)
400                 return err;
401
402         err = gk20a_init_mm_setup_hw(g);
403         if (err)
404                 return err;
405
406         return err;
407 }
408
409 #ifdef CONFIG_GK20A_PHYS_PAGE_TABLES
410 static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
411                             void **handle,
412                             struct sg_table **sgt,
413                             size_t *size)
414 {
415         u32 num_pages = 1 << order;
416         u32 len = num_pages * PAGE_SIZE;
417         int err;
418         struct page *pages;
419
420         gk20a_dbg_fn("");
421
422         pages = alloc_pages(GFP_KERNEL, order);
423         if (!pages) {
424                 gk20a_dbg(gpu_dbg_pte, "alloc_pages failed\n");
425                 goto err_out;
426         }
427         *sgt = kzalloc(sizeof(**sgt), GFP_KERNEL);
428         if (!(*sgt)) {
429                 gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
430                 goto err_alloced;
431         }
432         err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
433         if (err) {
434                 gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed\n");
435                 goto err_sg_table;
436         }
437         sg_set_page((*sgt)->sgl, pages, len, 0);
438         *handle = page_address(pages);
439         memset(*handle, 0, len);
440         *size = len;
441         FLUSH_CPU_DCACHE(*handle, sg_phys((*sgt)->sgl), len);
442
443         return 0;
444
445 err_sg_table:
446         kfree(*sgt);
447 err_alloced:
448         __free_pages(pages, order);
449 err_out:
450         return -ENOMEM;
451 }
452
453 static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
454                             struct sg_table *sgt, u32 order,
455                             size_t size)
456 {
457         gk20a_dbg_fn("");
458         BUG_ON(sgt == NULL);
459         free_pages((unsigned long)handle, order);
460         sg_free_table(sgt);
461         kfree(sgt);
462 }
463
464 static int map_gmmu_pages(void *handle, struct sg_table *sgt,
465                           void **va, size_t size)
466 {
467         FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
468         *va = handle;
469         return 0;
470 }
471
472 static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
473 {
474         FLUSH_CPU_DCACHE(handle, sg_phys(sgt->sgl), sgt->sgl->length);
475 }
476 #else
477
478 static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
479                             void **handle,
480                             struct sg_table **sgt,
481                             size_t *size)
482 {
483         struct device *d = dev_from_vm(vm);
484         u32 num_pages = 1 << order;
485         u32 len = num_pages * PAGE_SIZE;
486         dma_addr_t iova;
487         DEFINE_DMA_ATTRS(attrs);
488         struct page **pages;
489         void *cpuva;
490         int err = 0;
491
492         gk20a_dbg_fn("");
493
494         *size = len;
495
496         if (IS_ENABLED(CONFIG_ARM64)) {
497                 cpuva = dma_zalloc_coherent(d, len, &iova, GFP_KERNEL);
498                 if (!cpuva) {
499                         gk20a_err(d, "memory allocation failed\n");
500                         goto err_out;
501                 }
502
503                 err = gk20a_get_sgtable(d, sgt, cpuva, iova, len);
504                 if (err) {
505                         gk20a_err(d, "sgt allocation failed\n");
506                         goto err_free;
507                 }
508
509                 *handle = cpuva;
510         } else {
511                 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
512                 pages = dma_alloc_attrs(d, len, &iova, GFP_KERNEL, &attrs);
513                 if (!pages) {
514                         gk20a_err(d, "memory allocation failed\n");
515                         goto err_out;
516                 }
517
518                 err = gk20a_get_sgtable_from_pages(d, sgt, pages,
519                                         iova, len);
520                 if (err) {
521                         gk20a_err(d, "sgt allocation failed\n");
522                         goto err_free;
523                 }
524
525                 *handle = (void *)pages;
526         }
527
528         return 0;
529
530 err_free:
531         if (IS_ENABLED(CONFIG_ARM64)) {
532                 dma_free_coherent(d, len, cpuva, iova);
533                 cpuva = NULL;
534         } else {
535                 dma_free_attrs(d, len, pages, iova, &attrs);
536                 pages = NULL;
537         }
538         iova = 0;
539 err_out:
540         return -ENOMEM;
541 }
542
543 static void free_gmmu_pages(struct vm_gk20a *vm, void *handle,
544                             struct sg_table *sgt, u32 order,
545                             size_t size)
546 {
547         struct device *d = dev_from_vm(vm);
548         u64 iova;
549         DEFINE_DMA_ATTRS(attrs);
550         struct page **pages;
551
552         gk20a_dbg_fn("");
553         BUG_ON(sgt == NULL);
554
555         iova = sg_dma_address(sgt->sgl);
556
557         gk20a_free_sgtable(&sgt);
558
559         if (IS_ENABLED(CONFIG_ARM64)) {
560                 dma_free_coherent(d, size, handle, iova);
561         } else {
562                 pages = (struct page **)handle;
563                 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
564                 dma_free_attrs(d, size, pages, iova, &attrs);
565                 pages = NULL;
566         }
567
568         handle = NULL;
569         iova = 0;
570 }
571
572 static int map_gmmu_pages(void *handle, struct sg_table *sgt,
573                           void **kva, size_t size)
574 {
575         int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
576         struct page **pages;
577         gk20a_dbg_fn("");
578
579         if (IS_ENABLED(CONFIG_ARM64)) {
580                 *kva = handle;
581         } else {
582                 pages = (struct page **)handle;
583                 *kva = vmap(pages, count, 0, pgprot_dmacoherent(PAGE_KERNEL));
584                 if (!(*kva))
585                         return -ENOMEM;
586         }
587
588         return 0;
589 }
590
591 static void unmap_gmmu_pages(void *handle, struct sg_table *sgt, void *va)
592 {
593         gk20a_dbg_fn("");
594
595         if (!IS_ENABLED(CONFIG_ARM64))
596                 vunmap(va);
597         va = NULL;
598 }
599 #endif
600
601 /* allocate a phys contig region big enough for a full
602  * sized gmmu page table for the given gmmu_page_size.
603  * the whole range is zeroed so it's "invalid"/will fault
604  */
605
606 static int zalloc_gmmu_page_table_gk20a(struct vm_gk20a *vm,
607                                         enum gmmu_pgsz_gk20a gmmu_pgsz_idx,
608                                         struct page_table_gk20a *pte)
609 {
610         int err;
611         u32 pte_order;
612         void *handle = NULL;
613         struct sg_table *sgt;
614         size_t size;
615
616         gk20a_dbg_fn("");
617
618         /* allocate enough pages for the table */
619         pte_order = vm->mm->page_table_sizing[gmmu_pgsz_idx].order;
620
621         err = alloc_gmmu_pages(vm, pte_order, &handle, &sgt, &size);
622         if (err)
623                 return err;
624
625         gk20a_dbg(gpu_dbg_pte, "pte = 0x%p, addr=%08llx, size %d",
626                         pte, gk20a_mm_iova_addr(sgt->sgl), pte_order);
627
628         pte->ref = handle;
629         pte->sgt = sgt;
630         pte->size = size;
631
632         return 0;
633 }
634
635 /* given address range (inclusive) determine the pdes crossed */
636 static inline void pde_range_from_vaddr_range(struct vm_gk20a *vm,
637                                               u64 addr_lo, u64 addr_hi,
638                                               u32 *pde_lo, u32 *pde_hi)
639 {
640         *pde_lo = (u32)(addr_lo >> vm->mm->pde_stride_shift);
641         *pde_hi = (u32)(addr_hi >> vm->mm->pde_stride_shift);
642         gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
643                    addr_lo, addr_hi, vm->mm->pde_stride_shift);
644         gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
645                    *pde_lo, *pde_hi);
646 }
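/*
 * Editorial example: with pde_stride_shift = 27 (128MB of VA per PDE), the
 * range [0x10000000, 0x17ffffff] gives pde_lo = pde_hi = 2, i.e. a single
 * PDE, while a range that crosses a 128MB boundary returns pde_hi > pde_lo
 * and every PDE in between needs a backing page table.
 */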
647
648 static inline u32 *pde_from_index(struct vm_gk20a *vm, u32 i)
649 {
650         return (u32 *) (((u8 *)vm->pdes.kv) + i*gmmu_pde__size_v());
651 }
652
653 static inline u32 pte_index_from_vaddr(struct vm_gk20a *vm,
654                                        u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
655 {
656         u32 ret;
657         /* mask off pde part */
658         addr = addr & ((((u64)1) << vm->mm->pde_stride_shift) - ((u64)1));
659         /* shift over to get pte index. note assumption that pte index
660          * doesn't leak over into the high 32b */
661         ret = (u32)(addr >> gmmu_page_shifts[pgsz_idx]);
662
663         gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
664         return ret;
665 }
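/*
 * Editorial example: for 4KB pages and pde_stride_shift = 27, the address
 * 0x10012000 is masked down to its in-PDE offset 0x12000, and shifting by
 * the 12-bit page shift yields pte index 0x12.
 */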
666
667 static inline void pte_space_page_offset_from_index(u32 i, u32 *pte_page,
668                                                     u32 *pte_offset)
669 {
670         /* ptes are 8B regardless of pagesize */
671         /* pte space pages are 4KB. so 512 ptes per 4KB page*/
672         *pte_page = i >> 9;
673
674         /* this offset is a pte offset, not a byte offset */
675         *pte_offset = i & ((1<<9)-1);
676
677         gk20a_dbg(gpu_dbg_pte, "i=0x%x pte_page=0x%x pte_offset=0x%x",
678                    i, *pte_page, *pte_offset);
679 }
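/*
 * Editorial example: pte index i = 0x212 (530) lands in pte space page
 * 530 >> 9 = 1 at pte offset 530 & 511 = 18; with 8B ptes that is byte
 * offset 18 * 8 = 144 within that 4KB page.
 */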
680
681
682 /*
683  * given a pde index/page table number make sure it has
684  * backing store and if not go ahead allocate it and
685  * record it in the appropriate pde
686  */
687 static int validate_gmmu_page_table_gk20a_locked(struct vm_gk20a *vm,
688                                 u32 i, enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
689 {
690         int err;
691         struct page_table_gk20a *pte =
692                 vm->pdes.ptes[gmmu_pgsz_idx] + i;
693
694         gk20a_dbg_fn("");
695
696         /* if it's already in place it's valid */
697         if (pte->ref)
698                 return 0;
699
700         gk20a_dbg(gpu_dbg_pte, "alloc %dKB ptes for pde %d",
701                    gmmu_page_sizes[gmmu_pgsz_idx]/1024, i);
702
703         err = zalloc_gmmu_page_table_gk20a(vm, gmmu_pgsz_idx, pte);
704         if (err)
705                 return err;
706
707         /* rewrite pde */
708         update_gmmu_pde_locked(vm, i);
709
710         return 0;
711 }
712
713 static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
714                                                        u64 addr)
715 {
716         struct vm_reserved_va_node *va_node;
717         list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
718                 if (addr >= va_node->vaddr_start &&
719                     addr < (u64)va_node->vaddr_start + (u64)va_node->size)
720                         return va_node;
721
722         return NULL;
723 }
724
725 int gk20a_vm_get_buffers(struct vm_gk20a *vm,
726                          struct mapped_buffer_node ***mapped_buffers,
727                          int *num_buffers)
728 {
729         struct mapped_buffer_node *mapped_buffer;
730         struct mapped_buffer_node **buffer_list;
731         struct rb_node *node;
732         int i = 0;
733
734         mutex_lock(&vm->update_gmmu_lock);
735
736         buffer_list = kzalloc(sizeof(*buffer_list) *
737                               vm->num_user_mapped_buffers, GFP_KERNEL);
738         if (!buffer_list) {
739                 mutex_unlock(&vm->update_gmmu_lock);
740                 return -ENOMEM;
741         }
742
743         node = rb_first(&vm->mapped_buffers);
744         while (node) {
745                 mapped_buffer =
746                         container_of(node, struct mapped_buffer_node, node);
747                 if (mapped_buffer->user_mapped) {
748                         buffer_list[i] = mapped_buffer;
749                         kref_get(&mapped_buffer->ref);
750                         i++;
751                 }
752                 node = rb_next(&mapped_buffer->node);
753         }
754
755         BUG_ON(i != vm->num_user_mapped_buffers);
756
757         *num_buffers = vm->num_user_mapped_buffers;
758         *mapped_buffers = buffer_list;
759
760         mutex_unlock(&vm->update_gmmu_lock);
761
762         return 0;
763 }
764
765 static void gk20a_vm_unmap_locked_kref(struct kref *ref)
766 {
767         struct mapped_buffer_node *mapped_buffer =
768                 container_of(ref, struct mapped_buffer_node, ref);
769         gk20a_vm_unmap_locked(mapped_buffer);
770 }
771
772 void gk20a_vm_put_buffers(struct vm_gk20a *vm,
773                                  struct mapped_buffer_node **mapped_buffers,
774                                  int num_buffers)
775 {
776         int i;
777
778         mutex_lock(&vm->update_gmmu_lock);
779
780         for (i = 0; i < num_buffers; ++i)
781                 kref_put(&mapped_buffers[i]->ref,
782                          gk20a_vm_unmap_locked_kref);
783
784         mutex_unlock(&vm->update_gmmu_lock);
785
786         kfree(mapped_buffers);
787 }
788
789 static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
790 {
791         struct device *d = dev_from_vm(vm);
792         int retries;
793         struct mapped_buffer_node *mapped_buffer;
794
795         mutex_lock(&vm->update_gmmu_lock);
796
797         mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
798         if (!mapped_buffer) {
799                 mutex_unlock(&vm->update_gmmu_lock);
800                 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
801                 return;
802         }
803
804         if (mapped_buffer->flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
805                 mutex_unlock(&vm->update_gmmu_lock);
806
807                 retries = 1000;
808                 while (retries) {
809                         if (atomic_read(&mapped_buffer->ref.refcount) == 1)
810                                 break;
811                         retries--;
812                         udelay(50);
813                 }
814                 if (!retries)
815                         gk20a_err(d, "sync-unmap failed on 0x%llx",
816                                                                 offset);
817                 mutex_lock(&vm->update_gmmu_lock);
818         }
819
820         mapped_buffer->user_mapped--;
821         if (mapped_buffer->user_mapped == 0)
822                 vm->num_user_mapped_buffers--;
823         kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
824
825         mutex_unlock(&vm->update_gmmu_lock);
826 }
827
828 static u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
829                              u64 size,
830                              enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
831
832 {
833         struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
834         int err;
835         u64 offset;
836         u32 start_page_nr = 0, num_pages;
837         u64 gmmu_page_size = gmmu_page_sizes[gmmu_pgsz_idx];
838
839         if (gmmu_pgsz_idx >= ARRAY_SIZE(gmmu_page_sizes)) {
840                 dev_warn(dev_from_vm(vm),
841                          "invalid page size requested in gk20a vm alloc");
842                 return -EINVAL;
843         }
844
845         if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
846                 dev_warn(dev_from_vm(vm),
847                          "unsupported page size requested");
848                 return -EINVAL;
849
850         }
851
852         /* be certain we round up to gmmu_page_size if needed */
853         /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
854         size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
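        /*
         * Editorial example: with the 128KB page size, a request of
         * size = 0x21000 becomes (0x21000 + 0x1ffff) & ~0x1ffff = 0x40000,
         * i.e. two big pages.
         */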
855
856         gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
857                         gmmu_page_sizes[gmmu_pgsz_idx]>>10);
858
859         /* The vma allocator represents page accounting. */
860         num_pages = size >> gmmu_page_shifts[gmmu_pgsz_idx];
861
862         err = vma->alloc(vma, &start_page_nr, num_pages);
863
864         if (err) {
865                 gk20a_err(dev_from_vm(vm),
866                            "%s oom: sz=0x%llx", vma->name, size);
867                 return 0;
868         }
869
870         offset = (u64)start_page_nr << gmmu_page_shifts[gmmu_pgsz_idx];
871         gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
872
873         return offset;
874 }
875
876 static int gk20a_vm_free_va(struct vm_gk20a *vm,
877                              u64 offset, u64 size,
878                              enum gmmu_pgsz_gk20a pgsz_idx)
879 {
880         struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
881         u32 page_size = gmmu_page_sizes[pgsz_idx];
882         u32 page_shift = gmmu_page_shifts[pgsz_idx];
883         u32 start_page_nr, num_pages;
884         int err;
885
886         gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
887                         vma->name, offset, size);
888
889         start_page_nr = (u32)(offset >> page_shift);
890         num_pages = (u32)((size + page_size - 1) >> page_shift);
891
892         err = vma->free(vma, start_page_nr, num_pages);
893         if (err) {
894                 gk20a_err(dev_from_vm(vm),
895                            "not found: offset=0x%llx, sz=0x%llx",
896                            offset, size);
897         }
898
899         return err;
900 }
901
902 static int insert_mapped_buffer(struct rb_root *root,
903                                 struct mapped_buffer_node *mapped_buffer)
904 {
905         struct rb_node **new_node = &(root->rb_node), *parent = NULL;
906
907         /* Figure out where to put new node */
908         while (*new_node) {
909                 struct mapped_buffer_node *cmp_with =
910                         container_of(*new_node, struct mapped_buffer_node,
911                                      node);
912
913                 parent = *new_node;
914
915                 if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
916                         new_node = &((*new_node)->rb_left);
917                 else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
918                         new_node = &((*new_node)->rb_right);
919                 else
920                         return -EINVAL; /* no fair dup'ing */
921         }
922
923         /* Add new node and rebalance tree. */
924         rb_link_node(&mapped_buffer->node, parent, new_node);
925         rb_insert_color(&mapped_buffer->node, root);
926
927         return 0;
928 }
929
930 static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
931                                 struct rb_root *root, struct dma_buf *dmabuf,
932                                 u32 kind)
933 {
934         struct rb_node *node = rb_first(root);
935         while (node) {
936                 struct mapped_buffer_node *mapped_buffer =
937                         container_of(node, struct mapped_buffer_node, node);
938                 if (mapped_buffer->dmabuf == dmabuf &&
939                     kind == mapped_buffer->kind)
940                         return mapped_buffer;
941                 node = rb_next(&mapped_buffer->node);
942         }
943         return 0;
944 }
945
946 static struct mapped_buffer_node *find_mapped_buffer_locked(
947                                         struct rb_root *root, u64 addr)
948 {
949
950         struct rb_node *node = root->rb_node;
951         while (node) {
952                 struct mapped_buffer_node *mapped_buffer =
953                         container_of(node, struct mapped_buffer_node, node);
954                 if (mapped_buffer->addr > addr) /* u64 cmp */
955                         node = node->rb_left;
956                 else if (mapped_buffer->addr != addr) /* u64 cmp */
957                         node = node->rb_right;
958                 else
959                         return mapped_buffer;
960         }
961         return 0;
962 }
963
964 static struct mapped_buffer_node *find_mapped_buffer_range_locked(
965                                         struct rb_root *root, u64 addr)
966 {
967         struct rb_node *node = root->rb_node;
968         while (node) {
969                 struct mapped_buffer_node *m =
970                         container_of(node, struct mapped_buffer_node, node);
971                 if (m->addr <= addr && m->addr + m->size > addr)
972                         return m;
973                 else if (m->addr > addr) /* u64 cmp */
974                         node = node->rb_left;
975                 else
976                         node = node->rb_right;
977         }
978         return 0;
979 }
980
981 #define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
982
983 struct buffer_attrs {
984         struct sg_table *sgt;
985         u64 size;
986         u64 align;
987         u32 ctag_offset;
988         u32 ctag_lines;
989         int pgsz_idx;
990         u8 kind_v;
991         u8 uc_kind_v;
992 };
993
994 static void gmmu_select_page_size(struct buffer_attrs *bfr)
995 {
996         int i;
997         /*  choose the biggest first (top->bottom) */
998         for (i = (gmmu_nr_page_sizes-1); i >= 0; i--)
999                 if (!(gmmu_page_offset_masks[i] & bfr->align)) {
1000                         /* would like to add this too but nvmap returns the
1001                          * original requested size not the allocated size.
1002                          * (!(gmmu_page_offset_masks[i] & bfr->size)) */
1003                         bfr->pgsz_idx = i;
1004                         break;
1005                 }
1006 }
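/*
 * Editorial example: a buffer whose IOVA/phys address is 128KB aligned
 * (bfr->align & 0x1ffff == 0) selects gmmu_page_size_big here, while one
 * that is only 4KB aligned fails the big-page test and falls through to the
 * small-page entry.
 */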
1007
1008 static int setup_buffer_kind_and_compression(struct device *d,
1009                                              u32 flags,
1010                                              struct buffer_attrs *bfr,
1011                                              enum gmmu_pgsz_gk20a pgsz_idx)
1012 {
1013         bool kind_compressible;
1014
1015         if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
1016                 bfr->kind_v = gmmu_pte_kind_pitch_v();
1017
1018         if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
1019                 gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
1020                 return -EINVAL;
1021         }
1022
1023         bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
1024         /* find a suitable uncompressed kind if it becomes necessary later */
1025         kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
1026         if (kind_compressible) {
1027                 bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
1028                 if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
1029                         /* shouldn't happen, but it is worth cross-checking */
1030                         gk20a_err(d, "comptag kind 0x%x can't be"
1031                                    " downgraded to uncompressed kind",
1032                                    bfr->kind_v);
1033                         return -EINVAL;
1034                 }
1035         }
1036         /* comptags only supported for suitable kinds, 128KB pagesize */
1037         if (unlikely(kind_compressible &&
1038                      (gmmu_page_sizes[pgsz_idx] != 128*1024))) {
1039                 /*
1040                 gk20a_warn(d, "comptags specified"
1041                 " but pagesize being used doesn't support it");*/
1042                 /* it is safe to fall back to uncompressed as
1043                    functionality is not harmed */
1044                 bfr->kind_v = bfr->uc_kind_v;
1045                 kind_compressible = false;
1046         }
1047         if (kind_compressible)
1048                 bfr->ctag_lines = ALIGN(bfr->size, COMP_TAG_LINE_SIZE) >>
1049                         COMP_TAG_LINE_SIZE_SHIFT;
1050         else
1051                 bfr->ctag_lines = 0;
1052
1053         return 0;
1054 }
1055
1056 static int validate_fixed_buffer(struct vm_gk20a *vm,
1057                                  struct buffer_attrs *bfr,
1058                                  u64 map_offset, u64 map_size)
1059 {
1060         struct device *dev = dev_from_vm(vm);
1061         struct vm_reserved_va_node *va_node;
1062         struct mapped_buffer_node *buffer;
1063
1064         if (map_offset & gmmu_page_offset_masks[bfr->pgsz_idx]) {
1065                 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
1066                            map_offset);
1067                 return -EINVAL;
1068         }
1069
1070         /* find the space reservation */
1071         va_node = addr_to_reservation(vm, map_offset);
1072         if (!va_node) {
1073                 gk20a_warn(dev, "fixed offset mapping without space allocation");
1074                 return -EINVAL;
1075         }
1076
1077         /* check that this mapping does not collide with existing
1078          * mappings by checking the overlapping area between the current
1079          * buffer and all other mapped buffers */
1080
1081         list_for_each_entry(buffer,
1082                 &va_node->va_buffers_list, va_buffers_list) {
1083                 s64 begin = max(buffer->addr, map_offset);
1084                 s64 end = min(buffer->addr +
1085                         buffer->size, map_offset + map_size);
1086                 if (end - begin > 0) {
1087                         gk20a_warn(dev, "overlapping buffer map requested");
1088                         return -EINVAL;
1089                 }
1090         }
1091
1092         return 0;
1093 }
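/*
 * Editorial note on the overlap test above: begin = max(starts) and
 * end = min(ends), so end - begin > 0 exactly when the two ranges intersect.
 * For example an existing buffer at [0x1000, 0x3000) and a fixed-offset
 * request at [0x2000, 0x4000) give begin = 0x2000, end = 0x3000 and the map
 * is rejected.
 */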
1094
1095 static u64 __locked_gmmu_map(struct vm_gk20a *vm,
1096                                 u64 map_offset,
1097                                 struct sg_table *sgt,
1098                                 u64 buffer_offset,
1099                                 u64 size,
1100                                 int pgsz_idx,
1101                                 u8 kind_v,
1102                                 u32 ctag_offset,
1103                                 u32 flags,
1104                                 int rw_flag)
1105 {
1106         int err = 0, i = 0;
1107         bool allocated = false;
1108         u32 pde_lo, pde_hi;
1109         struct device *d = dev_from_vm(vm);
1110
1111         /* Allocate (or validate when map_offset != 0) the virtual address. */
1112         if (!map_offset) {
1113                 map_offset = gk20a_vm_alloc_va(vm, size,
1114                                           pgsz_idx);
1115                 if (!map_offset) {
1116                         gk20a_err(d, "failed to allocate va space");
1117                         err = -ENOMEM;
1118                         goto fail_alloc;
1119                 }
1120                 allocated = true;
1121         }
1122
1123         pde_range_from_vaddr_range(vm,
1124                                    map_offset,
1125                                    map_offset + size - 1,
1126                                    &pde_lo, &pde_hi);
1127
1128         /* mark the addr range valid (but with 0 phys addr, which will fault) */
1129         for (i = pde_lo; i <= pde_hi; i++) {
1130                 err = validate_gmmu_page_table_gk20a_locked(vm, i,
1131                                                             pgsz_idx);
1132                 if (err) {
1133                         gk20a_err(d, "failed to validate page table %d: %d",
1134                                                            i, err);
1135                         goto fail_validate;
1136                 }
1137         }
1138
1139         err = update_gmmu_ptes_locked(vm, pgsz_idx,
1140                                       sgt,
1141                                       buffer_offset,
1142                                       map_offset, map_offset + size - 1,
1143                                       kind_v,
1144                                       ctag_offset,
1145                                       flags &
1146                                       NVHOST_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1147                                       rw_flag);
1148         if (err) {
1149                 gk20a_err(d, "failed to update ptes on map");
1150                 goto fail_validate;
1151         }
1152
1153         return map_offset;
1154 fail_validate:
1155         if (allocated)
1156                 gk20a_vm_free_va(vm, map_offset, size, pgsz_idx);
1157 fail_alloc:
1158         gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
1159         return 0;
1160 }
1161
1162 static void __locked_gmmu_unmap(struct vm_gk20a *vm,
1163                                 u64 vaddr,
1164                                 u64 size,
1165                                 int pgsz_idx,
1166                                 bool va_allocated,
1167                                 int rw_flag)
1168 {
1169         int err = 0;
1170         struct gk20a *g = gk20a_from_vm(vm);
1171
1172         if (va_allocated) {
1173                 err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
1174                 if (err) {
1175                         dev_err(dev_from_vm(vm),
1176                                 "failed to free va");
1177                         return;
1178                 }
1179         }
1180
1181         /* unmap here needs to know the page size we assigned at mapping */
1182         err = update_gmmu_ptes_locked(vm,
1183                                 pgsz_idx,
1184                                 0, /* n/a for unmap */
1185                                 0,
1186                                 vaddr,
1187                                 vaddr + size - 1,
1188                                 0, 0, false /* n/a for unmap */,
1189                                 rw_flag);
1190         if (err)
1191                 dev_err(dev_from_vm(vm),
1192                         "failed to update gmmu ptes on unmap");
1193
1194         /* detect which if any pdes/ptes can now be released */
1195
1196         /* flush l2 so any dirty lines are written out *now*.
1197          *  also as we could potentially be switching this buffer
1198          * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
1199          * some point in the future we need to invalidate l2.  e.g. switching
1200          * from a render buffer unmap (here) to later using the same memory
1201          * for gmmu ptes.  note the positioning of this relative to any smmu
1202          * unmapping (below). */
1203
1204         gk20a_mm_l2_flush(g, true);
1205 }
1206
1207 static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
1208                                          struct dma_buf *dmabuf,
1209                                          u64 offset_align,
1210                                          u32 flags,
1211                                          int kind,
1212                                          struct sg_table **sgt,
1213                                          bool user_mapped,
1214                                          int rw_flag)
1215 {
1216         struct mapped_buffer_node *mapped_buffer = 0;
1217
1218         mapped_buffer =
1219                 find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
1220                                                   dmabuf, kind);
1221         if (!mapped_buffer)
1222                 return 0;
1223
1224         if (mapped_buffer->flags != flags)
1225                 return 0;
1226
1227         if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
1228             mapped_buffer->addr != offset_align)
1229                 return 0;
1230
1231         BUG_ON(mapped_buffer->vm != vm);
1232
1233         /* mark the buffer as used */
1234         if (user_mapped) {
1235                 if (mapped_buffer->user_mapped == 0)
1236                         vm->num_user_mapped_buffers++;
1237                 mapped_buffer->user_mapped++;
1238
1239                 /* If the mapping comes from user space, we own
1240                  * the handle ref. Since we reuse an
1241                  * existing mapping here, we need to give back those
1242                  * refs once in order not to leak.
1243                  */
1244                 if (mapped_buffer->own_mem_ref)
1245                         dma_buf_put(mapped_buffer->dmabuf);
1246                 else
1247                         mapped_buffer->own_mem_ref = true;
1248         }
1249         kref_get(&mapped_buffer->ref);
1250
1251         gk20a_dbg(gpu_dbg_map,
1252                    "reusing as=%d pgsz=%d flags=0x%x ctags=%d "
1253                    "start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x "
1254                    "own_mem_ref=%d user_mapped=%d",
1255                    vm_aspace_id(vm), mapped_buffer->pgsz_idx,
1256                    mapped_buffer->flags,
1257                    mapped_buffer->ctag_lines,
1258                    mapped_buffer->ctag_offset,
1259                    hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1260                    hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1261                    lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1262                    hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1263                    lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1264                    mapped_buffer->own_mem_ref, user_mapped);
1265
1266         if (sgt)
1267                 *sgt = mapped_buffer->sgt;
1268         return mapped_buffer->addr;
1269 }
1270
1271 u64 gk20a_vm_map(struct vm_gk20a *vm,
1272                         struct dma_buf *dmabuf,
1273                         u64 offset_align,
1274                         u32 flags /*NVHOST_AS_MAP_BUFFER_FLAGS_*/,
1275                         int kind,
1276                         struct sg_table **sgt,
1277                         bool user_mapped,
1278                         int rw_flag,
1279                         u64 buffer_offset,
1280                         u64 mapping_size)
1281 {
1282         struct gk20a *g = gk20a_from_vm(vm);
1283         struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
1284         struct device *d = dev_from_vm(vm);
1285         struct mapped_buffer_node *mapped_buffer = 0;
1286         bool inserted = false, va_allocated = false;
1287         u32 gmmu_page_size = 0;
1288         u64 map_offset = 0;
1289         int err = 0;
1290         struct buffer_attrs bfr = {0};
1291         struct gk20a_comptags comptags;
1292         u64 buf_addr;
1293
1294         mutex_lock(&vm->update_gmmu_lock);
1295
1296         /* check if this buffer is already mapped */
1297         map_offset = gk20a_vm_map_duplicate_locked(vm, dmabuf, offset_align,
1298                                                    flags, kind, sgt,
1299                                                    user_mapped, rw_flag);
1300         if (map_offset) {
1301                 mutex_unlock(&vm->update_gmmu_lock);
1302                 return map_offset;
1303         }
1304
1305         /* pin buffer to get phys/iovmm addr */
1306         bfr.sgt = gk20a_mm_pin(d, dmabuf);
1307         if (IS_ERR(bfr.sgt)) {
1308                 /* Falling back to physical is actually possible
1309                  * here in many cases if we use 4K phys pages in the
1310                  * gmmu.  However we have some regions which require
1311                  * contig regions to work properly (either phys-contig
1312                  * or contig through smmu io_vaspace).  Until we can
1313                  * track the difference between those two cases we have
1314                  * to fail the mapping when we run out of SMMU space.
1315                  */
1316                 gk20a_warn(d, "failed to pin buffer");
1317                 goto clean_up;
1318         }
1319
1320         if (sgt)
1321                 *sgt = bfr.sgt;
1322
1323         bfr.kind_v = kind;
1324         bfr.size = dmabuf->size;
1325         buf_addr = (u64)sg_dma_address(bfr.sgt->sgl);
1326         if (unlikely(!buf_addr))
1327                 buf_addr = (u64)sg_phys(bfr.sgt->sgl);
1328         bfr.align = 1 << __ffs(buf_addr);
1329         bfr.pgsz_idx = -1;
1330         mapping_size = mapping_size ? mapping_size : bfr.size;
1331
1332         /* If FIX_OFFSET is set, pgsz is determined. Otherwise, select
1333          * page size according to memory alignment */
1334         if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1335                 bfr.pgsz_idx = NV_GMMU_VA_IS_UPPER(offset_align) ?
1336                                 gmmu_page_size_big : gmmu_page_size_small;
1337         } else {
1338                 if (vm->big_pages)
1339                         gmmu_select_page_size(&bfr);
1340                 else
1341                         bfr.pgsz_idx = gmmu_page_size_small;
1342         }
1343
1344         /* validate/adjust bfr attributes */
1345         if (unlikely(bfr.pgsz_idx == -1)) {
1346                 gk20a_err(d, "unsupported page size detected");
1347                 goto clean_up;
1348         }
1349
1350         if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
1351                      bfr.pgsz_idx > gmmu_page_size_big)) {
1352                 BUG_ON(1);
1353                 err = -EINVAL;
1354                 goto clean_up;
1355         }
1356         gmmu_page_size = gmmu_page_sizes[bfr.pgsz_idx];
1357
1358         /* Check if we should use a fixed offset for mapping this buffer */
1359
1360         if (flags & NVHOST_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET)  {
1361                 err = validate_fixed_buffer(vm, &bfr,
1362                         offset_align, mapping_size);
1363                 if (err)
1364                         goto clean_up;
1365
1366                 map_offset = offset_align;
1367                 va_allocated = false;
1368         } else
1369                 va_allocated = true;
1370
1371         if (sgt)
1372                 *sgt = bfr.sgt;
1373
1374         err = setup_buffer_kind_and_compression(d, flags, &bfr, bfr.pgsz_idx);
1375         if (unlikely(err)) {
1376                 gk20a_err(d, "failure setting up kind and compression");
1377                 goto clean_up;
1378         }
1379
1380         /* bar1 and pmu vm don't need ctag */
1381         if (!vm->enable_ctag)
1382                 bfr.ctag_lines = 0;
1383
1384         gk20a_get_comptags(d, dmabuf, &comptags);
1385
1386         if (bfr.ctag_lines && !comptags.lines) {
1387                 /* allocate compression resources if needed */
1388                 err = gk20a_alloc_comptags(d, dmabuf, ctag_allocator,
1389                                            bfr.ctag_lines);
1390                 if (err) {
1391                         /* ok to fall back here if we ran out */
1392                         /* TBD: we can partially alloc ctags as well... */
1393                         bfr.ctag_lines = bfr.ctag_offset = 0;
1394                         bfr.kind_v = bfr.uc_kind_v;
1395                 } else {
1396                         gk20a_get_comptags(d, dmabuf, &comptags);
1397
1398                         /* init/clear the ctag buffer */
1399                         g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
1400                                           comptags.offset,
1401                                           comptags.offset + comptags.lines - 1);
1402                 }
1403         }
1404
1405         /* store the comptag info */
1406         bfr.ctag_offset = comptags.offset;
1407
1408         /* update gmmu ptes */
1409         map_offset = __locked_gmmu_map(vm, map_offset,
1410                                         bfr.sgt,
1411                                         buffer_offset, /* sg offset */
1412                                         mapping_size,
1413                                         bfr.pgsz_idx,
1414                                         bfr.kind_v,
1415                                         bfr.ctag_offset,
1416                                         flags, rw_flag);
1417
1418         if (!map_offset)
1419                 goto clean_up;
1420
1421         gk20a_dbg(gpu_dbg_map,
1422            "as=%d pgsz=%d "
1423            "kind=0x%x kind_uc=0x%x flags=0x%x "
1424            "ctags=%d start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x",
1425            vm_aspace_id(vm), gmmu_page_size,
1426            bfr.kind_v, bfr.uc_kind_v, flags,
1427            bfr.ctag_lines, bfr.ctag_offset,
1428            hi32(map_offset), lo32(map_offset),
1429            hi32((u64)sg_dma_address(bfr.sgt->sgl)),
1430            lo32((u64)sg_dma_address(bfr.sgt->sgl)),
1431            hi32((u64)sg_phys(bfr.sgt->sgl)),
1432            lo32((u64)sg_phys(bfr.sgt->sgl)));
1433
1434 #if defined(NVHOST_DEBUG)
1435         {
1436                 int i;
1437                 struct scatterlist *sg = NULL;
1438                 gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
1439                 for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i ) {
1440                         u64 da = sg_dma_address(sg);
1441                         u64 pa = sg_phys(sg);
1442                         u64 len = sg->length;
1443                         gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
1444                                    i, hi32(pa), lo32(pa), hi32(da), lo32(da),
1445                                    hi32(len), lo32(len));
1446                 }
1447         }
1448 #endif
1449
1450         /* keep track of the buffer for unmapping */
1451         /* TBD: check for multiple mapping of same buffer */
1452         mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
1453         if (!mapped_buffer) {
1454                 gk20a_warn(d, "oom allocating tracking buffer");
1455                 goto clean_up;
1456         }
1457         mapped_buffer->dmabuf      = dmabuf;
1458         mapped_buffer->sgt         = bfr.sgt;
1459         mapped_buffer->addr        = map_offset;
1460         mapped_buffer->size        = mapping_size;
1461         mapped_buffer->pgsz_idx    = bfr.pgsz_idx;
1462         mapped_buffer->ctag_offset = bfr.ctag_offset;
1463         mapped_buffer->ctag_lines  = bfr.ctag_lines;
1464         mapped_buffer->vm          = vm;
1465         mapped_buffer->flags       = flags;
1466         mapped_buffer->kind        = kind;
1467         mapped_buffer->va_allocated = va_allocated;
1468         mapped_buffer->user_mapped = user_mapped ? 1 : 0;
1469         mapped_buffer->own_mem_ref = user_mapped;
1470         INIT_LIST_HEAD(&mapped_buffer->unmap_list);
1471         INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
1472         kref_init(&mapped_buffer->ref);
1473
1474         err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
1475         if (err) {
1476                 gk20a_err(d, "failed to insert into mapped buffer tree");
1477                 goto clean_up;
1478         }
1479         inserted = true;
1480         if (user_mapped)
1481                 vm->num_user_mapped_buffers++;
1482
1483         gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
1484
1485         if (!va_allocated) {
1486                 struct vm_reserved_va_node *va_node;
1487
1488                 /* find the space reservation */
1489                 va_node = addr_to_reservation(vm, map_offset);
1490                 list_add_tail(&mapped_buffer->va_buffers_list,
1491                               &va_node->va_buffers_list);
1492                 mapped_buffer->va_node = va_node;
1493         }
1494
1495         mutex_unlock(&vm->update_gmmu_lock);
1496
1497         /* Invalidate kernel mappings immediately */
1498         if (vm_aspace_id(vm) == -1)
1499                 gk20a_mm_tlb_invalidate(vm);
1500
1501         return map_offset;
1502
1503 clean_up:
1504         if (inserted) {
1505                 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1506                 if (user_mapped)
1507                         vm->num_user_mapped_buffers--;
1508         }
1509         kfree(mapped_buffer);
1510         if (va_allocated)
1511                 gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
1512         if (!IS_ERR(bfr.sgt))
1513                 gk20a_mm_unpin(d, dmabuf, bfr.sgt);
1514
1515         mutex_unlock(&vm->update_gmmu_lock);
1516         gk20a_dbg_info("err=%d\n", err);
1517         return 0;
1518 }
1519
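     /*
      * Kernel-internal mapping helper: map an already-pinned sg_table into
      * this vm.  The va is always allocated here (offset 0 is passed to
      * __locked_gmmu_map), small 4K pages, kind 0 and no comptags are used,
      * and the tlb is invalidated immediately.  Returns the gpu va, or 0 on
      * failure.  Illustrative pairing with the sgtable helpers below:
      *
      *     gk20a_get_sgtable(d, &sgt, cpuva, iova, size);
      *     gpu_va = gk20a_gmmu_map(vm, &sgt, size, 0, gk20a_mem_flag_none);
      *     ...
      *     gk20a_gmmu_unmap(vm, gpu_va, size, gk20a_mem_flag_none);
      *     gk20a_free_sgtable(&sgt);
      */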
1520 u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1521                 struct sg_table **sgt,
1522                 u64 size,
1523                 u32 flags,
1524                 int rw_flag)
1525 {
1526         u64 vaddr;
1527
1528         mutex_lock(&vm->update_gmmu_lock);
1529         vaddr = __locked_gmmu_map(vm, 0, /* already mapped? - No */
1530                                 *sgt, /* sg table */
1531                                 0, /* sg offset */
1532                                 size,
1533                                 0, /* page size index = 0 i.e. SZ_4K */
1534                                 0, /* kind */
1535                                 0, /* ctag_offset */
1536                                 flags, rw_flag);
1537         mutex_unlock(&vm->update_gmmu_lock);
1538         if (!vaddr) {
1539                 gk20a_err(dev_from_vm(vm), "failed to allocate va space");
1540                 return 0;
1541         }
1542
1543         /* Invalidate kernel mappings immediately */
1544         gk20a_mm_tlb_invalidate(vm);
1545
1546         return vaddr;
1547 }
1548
1549 void gk20a_gmmu_unmap(struct vm_gk20a *vm,
1550                 u64 vaddr,
1551                 u64 size,
1552                 int rw_flag)
1553 {
1554         mutex_lock(&vm->update_gmmu_lock);
1555         __locked_gmmu_unmap(vm,
1556                         vaddr,
1557                         size,
1558                         0, /* page size 4K */
1559                         true, /*va_allocated */
1560                         rw_flag);
1561         mutex_unlock(&vm->update_gmmu_lock);
1562 }
1563
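     /*
      * Translate a dma address (iova) back to a physical address.  If the
      * device has no iommu mapping, the dma address already is the physical
      * address and is returned unchanged.
      */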
1564 phys_addr_t gk20a_get_phys_from_iova(struct device *d,
1565                                 u64 dma_addr)
1566 {
1567         phys_addr_t phys;
1568         u64 iova;
1569
1570         struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
1571         if (!mapping)
1572                 return dma_addr;
1573
1574         iova = dma_addr & PAGE_MASK;
1575         phys = iommu_iova_to_phys(mapping->domain, iova);
1576         return phys;
1577 }
1578
1579 /* get sg_table from already allocated buffer */
1580 int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
1581                         void *cpuva, u64 iova,
1582                         size_t size)
1583 {
1584         int err = 0;
1585         *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1586         if (!(*sgt)) {
1587                 dev_err(d, "failed to allocate memory\n");
1588                 err = -ENOMEM;
1589                 goto fail;
1590         }
1591         err = dma_get_sgtable(d, *sgt,
1592                         cpuva, iova,
1593                         size);
1594         if (err) {
1595                 dev_err(d, "failed to create sg table\n");
1596                 goto fail;
1597         }
1598         sg_dma_address((*sgt)->sgl) = iova;
1599
1600         return 0;
1601  fail:
1602         if (*sgt) {
1603                 kfree(*sgt);
1604                 *sgt = NULL;
1605         }
1606         return err;
1607 }
1608
1609 int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
1610                         struct page **pages, u64 iova,
1611                         size_t size)
1612 {
1613         int err = 0;
1614         *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1615         if (!(*sgt)) {
1616                 dev_err(d, "failed to allocate memory\n");
1617                 err = -ENOMEM;
1618                 goto fail;
1619         }
1620         err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
1621         if (err) {
1622                 dev_err(d, "failed to allocate sg_table\n");
1623                 goto fail;
1624         }
1625         sg_set_page((*sgt)->sgl, *pages, size, 0);
1626         sg_dma_address((*sgt)->sgl) = iova;
1627
1628         return 0;
1629  fail:
1630         if (*sgt) {
1631                 kfree(*sgt);
1632                 *sgt = NULL;
1633         }
1634         return err;
1635 }
1636
1637 void gk20a_free_sgtable(struct sg_table **sgt)
1638 {
1639         sg_free_table(*sgt);
1640         kfree(*sgt);
1641         *sgt = NULL;
1642 }
1643
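     /*
      * Address to program into gmmu ptes/pdes for a scatterlist entry: the
      * smmu iova with the translation bit set when the buffer went through
      * the smmu (CONFIG_TEGRA_IOMMU_SMMU), otherwise the raw physical
      * address.  A dma address of DMA_ERROR_CODE yields 0.
      */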
1644 u64 gk20a_mm_iova_addr(struct scatterlist *sgl)
1645 {
1646         u64 result = sg_phys(sgl);
1647 #ifdef CONFIG_TEGRA_IOMMU_SMMU
1648         if (sg_dma_address(sgl) == DMA_ERROR_CODE)
1649                 result = 0;
1650         else if (sg_dma_address(sgl)) {
1651                 result = sg_dma_address(sgl) |
1652                         1ULL << NV_MC_SMMU_VADDR_TRANSLATION_BIT;
1653         }
1654 #endif
1655         return result;
1656 }
1657
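     /*
      * Write (or clear) the ptes covering [first_vaddr, last_vaddr] for the
      * given page size.  With a non-NULL sgt the ptes are filled from the
      * scatterlist (skipping buffer_offset bytes into it) and each written
      * pte takes a reference on its page table; with a NULL sgt the ptes are
      * cleared and references are dropped, freeing any page table whose
      * refcount reaches zero and rewriting its pde.  Runs under
      * update_gmmu_lock and only marks the tlb dirty; the caller is
      * responsible for the actual invalidate.
      */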
1658 static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1659                                    enum gmmu_pgsz_gk20a pgsz_idx,
1660                                    struct sg_table *sgt,
1661                                    u64 buffer_offset,
1662                                    u64 first_vaddr, u64 last_vaddr,
1663                                    u8 kind_v, u32 ctag_offset,
1664                                    bool cacheable,
1665                                    int rw_flag)
1666 {
1667         int err;
1668         u32 pde_lo, pde_hi, pde_i;
1669         struct scatterlist *cur_chunk;
1670         unsigned int cur_offset;
1671         u32 pte_w[2] = {0, 0}; /* invalid pte */
1672         u32 ctag = ctag_offset;
1673         u32 ctag_incr;
1674         u32 page_size  = gmmu_page_sizes[pgsz_idx];
1675         u64 addr = 0;
1676         u64 space_to_skip = buffer_offset;
1677
1678         pde_range_from_vaddr_range(vm, first_vaddr, last_vaddr,
1679                                    &pde_lo, &pde_hi);
1680
1681         gk20a_dbg(gpu_dbg_pte, "size_idx=%d, pde_lo=%d, pde_hi=%d",
1682                    pgsz_idx, pde_lo, pde_hi);
1683
1684         /* If ctag_offset != 0, advance ctag by 1 per pte, otherwise by 0.  The
1685          * idea is to avoid a per-pte branch below. Note: this only works when
1686          * the page size (with comptags active) is 128KB; we check that elsewhere. */
1687         ctag_incr = !!ctag_offset;
1688
1689         cur_offset = 0;
1690         if (sgt) {
1691                 cur_chunk = sgt->sgl;
1692                 /* space_to_skip must be page aligned */
1693                 BUG_ON(space_to_skip & (page_size - 1));
1694
1695                 while (space_to_skip > 0 && cur_chunk) {
1696                         u64 new_addr = gk20a_mm_iova_addr(cur_chunk);
1697                         if (new_addr) {
1698                                 addr = new_addr;
1699                                 addr += cur_offset;
1700                         }
1701                         cur_offset += page_size;
1702                         addr += page_size;
1703                         while (cur_chunk &&
1704                                 cur_offset >= cur_chunk->length) {
1705                                 cur_offset -= cur_chunk->length;
1706                                 cur_chunk = sg_next(cur_chunk);
1707                         }
1708                         space_to_skip -= page_size;
1709                 }
1710         } else {
1711                 cur_chunk = NULL;
1712         }
1713
1714         for (pde_i = pde_lo; pde_i <= pde_hi; pde_i++) {
1715                 u32 pte_lo, pte_hi;
1716                 u32 pte_cur;
1717                 void *pte_kv_cur;
1718
1719                 struct page_table_gk20a *pte = vm->pdes.ptes[pgsz_idx] + pde_i;
1720
1721                 if (pde_i == pde_lo)
1722                         pte_lo = pte_index_from_vaddr(vm, first_vaddr,
1723                                                       pgsz_idx);
1724                 else
1725                         pte_lo = 0;
1726
1727                 if ((pde_i != pde_hi) && (pde_hi != pde_lo))
1728                         pte_hi = vm->mm->page_table_sizing[pgsz_idx].num_ptes-1;
1729                 else
1730                         pte_hi = pte_index_from_vaddr(vm, last_vaddr,
1731                                                       pgsz_idx);
1732
1733                 /* get cpu access to the ptes */
1734                 err = map_gmmu_pages(pte->ref, pte->sgt, &pte_kv_cur,
1735                                      pte->size);
1736                 if (err) {
1737                         gk20a_err(dev_from_vm(vm),
1738                                    "couldn't map ptes for update as=%d pte_ref_cnt=%d",
1739                                    vm_aspace_id(vm), pte->ref_cnt);
1740                         goto clean_up;
1741                 }
1742
1743                 gk20a_dbg(gpu_dbg_pte, "pte_lo=%d, pte_hi=%d", pte_lo, pte_hi);
1744                 for (pte_cur = pte_lo; pte_cur <= pte_hi; pte_cur++) {
1745                         if (likely(sgt)) {
1746                                 u64 new_addr = gk20a_mm_iova_addr(cur_chunk);
1747                                 if (new_addr) {
1748                                         addr = new_addr;
1749                                         addr += cur_offset;
1750                                 }
1751                                 pte_w[0] = gmmu_pte_valid_true_f() |
1752                                         gmmu_pte_address_sys_f(addr
1753                                                 >> gmmu_pte_address_shift_v());
1754                                 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1755                                         gmmu_pte_kind_f(kind_v) |
1756                                         gmmu_pte_comptagline_f(ctag);
1757
1758                                 if (rw_flag == gk20a_mem_flag_read_only) {
1759                                         pte_w[0] |= gmmu_pte_read_only_true_f();
1760                                         pte_w[1] |=
1761                                                 gmmu_pte_write_disable_true_f();
1762                                 } else if (rw_flag ==
1763                                            gk20a_mem_flag_write_only) {
1764                                         pte_w[1] |=
1765                                                 gmmu_pte_read_disable_true_f();
1766                                 }
1767                                 if (!cacheable)
1768                                         pte_w[1] |= gmmu_pte_vol_true_f();
1769
1770                                 pte->ref_cnt++;
1771                                 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d addr=0x%x,%08x kind=%d"
1772                                            " ctag=%d vol=%d refs=%d"
1773                                            " [0x%08x,0x%08x]",
1774                                            pte_cur, hi32(addr), lo32(addr),
1775                                            kind_v, ctag, !cacheable,
1776                                            pte->ref_cnt, pte_w[1], pte_w[0]);
1777                                 ctag += ctag_incr;
1778                                 cur_offset += page_size;
1779                                 addr += page_size;
1780                                 while (cur_chunk &&
1781                                         cur_offset >= cur_chunk->length) {
1782                                         cur_offset -= cur_chunk->length;
1783                                         cur_chunk = sg_next(cur_chunk);
1784                                 }
1785
1786                         } else {
1787                                 pte->ref_cnt--;
1788                                 gk20a_dbg(gpu_dbg_pte,
1789                                            "pte_cur=%d ref=%d [0x0,0x0]",
1790                                            pte_cur, pte->ref_cnt);
1791                         }
1792
1793                         gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 0, pte_w[0]);
1794                         gk20a_mem_wr32(pte_kv_cur + pte_cur*8, 1, pte_w[1]);
1795                 }
1796
1797                 unmap_gmmu_pages(pte->ref, pte->sgt, pte_kv_cur);
1798
1799                 if (pte->ref_cnt == 0) {
1800                         void *pte_ref_ptr = pte->ref;
1801
1802                         /* It can make sense to keep around one page table for
1803                          * each flavor (empty)... in case a new map is coming
1804                          * right back to alloc (and fill it in) again.
1805                          * But: deferring unmapping should help with pathologic
1806                          * unmap/map/unmap/map cases where we'd trigger pte
1807                          * free/alloc/free/alloc.
1808                          */
1809                         pte->ref = NULL;
1810
1811                         /* rewrite pde */
1812                         update_gmmu_pde_locked(vm, pde_i);
1813
1814                         free_gmmu_pages(vm, pte_ref_ptr, pte->sgt,
1815                                 vm->mm->page_table_sizing[pgsz_idx].order,
1816                                 pte->size);
1817
1818                 }
1819
1820         }
1821
1822         smp_mb();
1823         vm->tlb_dirty = true;
1824         gk20a_dbg_fn("set tlb dirty");
1825
1826         return 0;
1827
1828 clean_up:
1829         /* TBD: potentially rewrite the above to pre-map everything it needs,
1830          * since failing to map the ptes is the only way this can fail */
1831         return err;
1832
1833 }
1834
1835
1836 /* for gk20a the "video memory" apertures here are misnomers. */
1837 static inline u32 big_valid_pde0_bits(u64 pte_addr)
1838 {
1839         u32 pde0_bits =
1840                 gmmu_pde_aperture_big_video_memory_f() |
1841                 gmmu_pde_address_big_sys_f(
1842                            (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1843         return  pde0_bits;
1844 }
1845 static inline u32 small_valid_pde1_bits(u64 pte_addr)
1846 {
1847         u32 pde1_bits =
1848                 gmmu_pde_aperture_small_video_memory_f() |
1849                 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1850                 gmmu_pde_address_small_sys_f(
1851                            (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1852         return pde1_bits;
1853 }
1854
1855 /* Given the current state of the ptes associated with a pde,
1856    determine the pde value and write it out.  There's no check
1857    here for whether or not a change was actually made, so
1858    superfluous updates will cause unnecessary pde
1859    invalidations.
1860 */
1861 static void update_gmmu_pde_locked(struct vm_gk20a *vm, u32 i)
1862 {
1863         bool small_valid, big_valid;
1864         u64 pte_addr[2] = {0, 0};
1865         struct page_table_gk20a *small_pte =
1866                 vm->pdes.ptes[gmmu_page_size_small] + i;
1867         struct page_table_gk20a *big_pte =
1868                 vm->pdes.ptes[gmmu_page_size_big] + i;
1869         u32 pde_v[2] = {0, 0};
1870         u32 *pde;
1871
1872         small_valid = small_pte && small_pte->ref;
1873         big_valid   = big_pte && big_pte->ref;
1874
1875         if (small_valid)
1876                 pte_addr[gmmu_page_size_small] =
1877                         gk20a_mm_iova_addr(small_pte->sgt->sgl);
1878         if (big_valid)
1879                 pte_addr[gmmu_page_size_big] =
1880                         gk20a_mm_iova_addr(big_pte->sgt->sgl);
1881
1882         pde_v[0] = gmmu_pde_size_full_f();
1883         pde_v[0] |= big_valid ?
1884                 big_valid_pde0_bits(pte_addr[gmmu_page_size_big])
1885                 :
1886                 (gmmu_pde_aperture_big_invalid_f());
1887
1888         pde_v[1] |= (small_valid ?
1889                      small_valid_pde1_bits(pte_addr[gmmu_page_size_small])
1890                      :
1891                      (gmmu_pde_aperture_small_invalid_f() |
1892                       gmmu_pde_vol_small_false_f())
1893                      )
1894                 |
1895                 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1896                  gmmu_pde_vol_big_false_f());
1897
1898         pde = pde_from_index(vm, i);
1899
1900         gk20a_mem_wr32(pde, 0, pde_v[0]);
1901         gk20a_mem_wr32(pde, 1, pde_v[1]);
1902
1903         smp_mb();
1904
1905         FLUSH_CPU_DCACHE(pde,
1906                          sg_phys(vm->pdes.sgt->sgl) + (i*gmmu_pde__size_v()),
1907                          sizeof(u32)*2);
1908
1909         gk20a_mm_l2_invalidate(vm->mm->g);
1910
1911         gk20a_dbg(gpu_dbg_pte, "pde:%d = 0x%x,0x%08x\n", i, pde_v[1], pde_v[0]);
1912
1913         vm->tlb_dirty = true;
1914 }
1915
1916
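     /*
      * Back [vaddr, vaddr + num_pages * pgsz) with the per-vm zero page,
      * which is what sparse/reserved ranges are mapped to.  The zero page
      * (one big page's worth) is allocated lazily on first use and every
      * page in the range is mapped to it at a fixed offset.
      */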
1917 static int gk20a_vm_put_empty(struct vm_gk20a *vm, u64 vaddr,
1918                                u32 num_pages, u32 pgsz_idx)
1919 {
1920         struct mm_gk20a *mm = vm->mm;
1921         struct gk20a *g = mm->g;
1922         u32 pgsz = gmmu_page_sizes[pgsz_idx];
1923         u32 i;
1924         dma_addr_t iova;
1925
1926         /* allocate the zero page if the vm does not already have one */
1927         if (!vm->zero_page_cpuva) {
1928                 int err = 0;
1929                 vm->zero_page_cpuva = dma_alloc_coherent(&g->dev->dev,
1930                                                          mm->big_page_size,
1931                                                          &iova,
1932                                                          GFP_KERNEL);
1933                 if (!vm->zero_page_cpuva) {
1934                         dev_err(&g->dev->dev, "failed to allocate zero page\n");
1935                         return -ENOMEM;
1936                 }
1937
1938                 vm->zero_page_iova = iova;
1939                 err = gk20a_get_sgtable(&g->dev->dev, &vm->zero_page_sgt,
1940                                         vm->zero_page_cpuva, vm->zero_page_iova,
1941                                         mm->big_page_size);
1942                 if (err) {
1943                         dma_free_coherent(&g->dev->dev, mm->big_page_size,
1944                                           vm->zero_page_cpuva,
1945                                           vm->zero_page_iova);
1946                         vm->zero_page_iova = 0;
1947                         vm->zero_page_cpuva = NULL;
1948
1949                         dev_err(&g->dev->dev, "failed to create sg table for zero page\n");
1950                         return -ENOMEM;
1951                 }
1952         }
1953
1954         for (i = 0; i < num_pages; i++) {
1955                 u64 page_vaddr = __locked_gmmu_map(vm, vaddr,
1956                         vm->zero_page_sgt, 0, pgsz, pgsz_idx, 0, 0,
1957                         NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET,
1958                         gk20a_mem_flag_none);
1959
1960                 if (!page_vaddr) {
1961                         gk20a_err(dev_from_vm(vm), "failed to remap clean buffers!");
1962                         goto err_unmap;
1963                 }
1964                 vaddr += pgsz;
1965         }
1966
1967         return 0;
1968
1969 err_unmap:
1970
1971         WARN_ON(1);
1972         /* something went wrong. unmap pages */
1973         while (i--) {
1974                 vaddr -= pgsz;
1975                 __locked_gmmu_unmap(vm, vaddr, pgsz, pgsz_idx, 0,
1976                                     gk20a_mem_flag_none);
1977         }
1978
1979         return -EINVAL;
1980 }
1981
1982 /* NOTE! mapped_buffers lock must be held */
1983 static void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
1984 {
1985         struct vm_gk20a *vm = mapped_buffer->vm;
1986
1987         if (mapped_buffer->va_node &&
1988             mapped_buffer->va_node->sparse) {
1989                 u64 vaddr = mapped_buffer->addr;
1990                 u32 pgsz_idx = mapped_buffer->pgsz_idx;
1991                 u32 num_pages = mapped_buffer->size >>
1992                         gmmu_page_shifts[pgsz_idx];
1993
1994                 /* there is little we can do if this fails... */
1995                 gk20a_vm_put_empty(vm, vaddr, num_pages, pgsz_idx);
1996
1997         } else
1998                 __locked_gmmu_unmap(vm,
1999                                 mapped_buffer->addr,
2000                                 mapped_buffer->size,
2001                                 mapped_buffer->pgsz_idx,
2002                                 mapped_buffer->va_allocated,
2003                                 gk20a_mem_flag_none);
2004
2005         gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
2006                    vm_aspace_id(vm), gmmu_page_sizes[mapped_buffer->pgsz_idx],
2007                    hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
2008                    mapped_buffer->own_mem_ref);
2009
2010         gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
2011                        mapped_buffer->sgt);
2012
2013         /* remove from mapped buffer tree and remove list, free */
2014         rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
2015         if (!list_empty(&mapped_buffer->va_buffers_list))
2016                 list_del(&mapped_buffer->va_buffers_list);
2017
2018         /* keep track of mapped buffers */
2019         if (mapped_buffer->user_mapped)
2020                 vm->num_user_mapped_buffers--;
2021
2022         if (mapped_buffer->own_mem_ref)
2023                 dma_buf_put(mapped_buffer->dmabuf);
2024
2025         kfree(mapped_buffer);
2026
2027         return;
2028 }
2029
2030 void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
2031 {
2032         struct device *d = dev_from_vm(vm);
2033         struct mapped_buffer_node *mapped_buffer;
2034
2035         mutex_lock(&vm->update_gmmu_lock);
2036         mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
2037         if (!mapped_buffer) {
2038                 mutex_unlock(&vm->update_gmmu_lock);
2039                 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
2040                 return;
2041         }
2042
2043         kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
2044         mutex_unlock(&vm->update_gmmu_lock);
2045 }
2046
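     /*
      * Tear down a vm: unmap every tracked buffer, drop the reserved va
      * nodes, free any remaining page tables and the page directory,
      * destroy the va allocators and release the zero page before freeing
      * the vm itself.
      */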
2047 static void gk20a_vm_remove_support(struct vm_gk20a *vm)
2048 {
2049         struct gk20a *g = vm->mm->g;
2050         struct mapped_buffer_node *mapped_buffer;
2051         struct vm_reserved_va_node *va_node, *va_node_tmp;
2052         struct rb_node *node;
2053         int i;
2054
2055         gk20a_dbg_fn("");
2056         mutex_lock(&vm->update_gmmu_lock);
2057
2058         /* TBD: add a flag here for the unmap code to recognize teardown
2059          * and short-circuit any otherwise expensive operations. */
2060
2061         node = rb_first(&vm->mapped_buffers);
2062         while (node) {
2063                 mapped_buffer =
2064                         container_of(node, struct mapped_buffer_node, node);
2065                 gk20a_vm_unmap_locked(mapped_buffer);
2066                 node = rb_first(&vm->mapped_buffers);
2067         }
2068
2069         /* destroy remaining reserved memory areas */
2070         list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
2071                 reserved_va_list) {
2072                 list_del(&va_node->reserved_va_list);
2073                 kfree(va_node);
2074         }
2075
2076         /* unmapping all buffers above may not actually free
2077          * all vm ptes.  jettison them here for certain... */
2078         for (i = 0; i < vm->pdes.num_pdes; i++) {
2079                 struct page_table_gk20a *pte =
2080                         &vm->pdes.ptes[gmmu_page_size_small][i];
2081                 if (pte->ref) {
2082                         free_gmmu_pages(vm, pte->ref, pte->sgt,
2083                                 vm->mm->page_table_sizing[gmmu_page_size_small].order,
2084                                 pte->size);
2085                         pte->ref = NULL;
2086                 }
2087                 pte = &vm->pdes.ptes[gmmu_page_size_big][i];
2088                 if (pte->ref) {
2089                         free_gmmu_pages(vm, pte->ref, pte->sgt,
2090                                 vm->mm->page_table_sizing[gmmu_page_size_big].order,
2091                                 pte->size);
2092                         pte->ref = NULL;
2093                 }
2094         }
2095
2096         unmap_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, vm->pdes.kv);
2097         free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0, vm->pdes.size);
2098
2099         kfree(vm->pdes.ptes[gmmu_page_size_small]);
2100         kfree(vm->pdes.ptes[gmmu_page_size_big]);
2101         gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
2102         gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
2103
2104         mutex_unlock(&vm->update_gmmu_lock);
2105
2106         /* release zero page if used */
2107         if (vm->zero_page_cpuva)
2108                 dma_free_coherent(&g->dev->dev, vm->mm->big_page_size,
2109                                   vm->zero_page_cpuva, vm->zero_page_iova);
2110
2111         /* vm is not used anymore. release it. */
2112         kfree(vm);
2113 }
2114
2115 static void gk20a_vm_remove_support_kref(struct kref *ref)
2116 {
2117         struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
2118         gk20a_vm_remove_support(vm);
2119 }
2120
2121 void gk20a_vm_get(struct vm_gk20a *vm)
2122 {
2123         kref_get(&vm->ref);
2124 }
2125
2126 void gk20a_vm_put(struct vm_gk20a *vm)
2127 {
2128         kref_put(&vm->ref, gk20a_vm_remove_support_kref);
2129 }
2130
2131 /* address space interfaces for the gk20a module */
2132 int gk20a_vm_alloc_share(struct gk20a_as_share *as_share)
2133 {
2134         struct gk20a_as *as = as_share->as;
2135         struct gk20a *g = gk20a_from_as(as);
2136         struct mm_gk20a *mm = &g->mm;
2137         struct vm_gk20a *vm;
2138         u64 vma_size;
2139         u32 num_pages, low_hole_pages;
2140         char name[32];
2141         int err;
2142
2143         gk20a_dbg_fn("");
2144
2145         vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2146         if (!vm)
2147                 return -ENOMEM;
2148
2149         as_share->vm = vm;
2150
2151         vm->mm = mm;
2152         vm->as_share = as_share;
2153
2154         vm->big_pages = true;
2155
2156         vm->va_start  = mm->pde_stride;   /* create a one pde hole */
2157         vm->va_limit  = mm->channel.size; /* note this means channel.size is
2158                                              really just the max */
2159         {
2160                 u32 pde_lo, pde_hi;
2161                 pde_range_from_vaddr_range(vm,
2162                                            0, vm->va_limit-1,
2163                                            &pde_lo, &pde_hi);
2164                 vm->pdes.num_pdes = pde_hi + 1;
2165         }
2166
2167         vm->pdes.ptes[gmmu_page_size_small] =
2168                 kzalloc(sizeof(struct page_table_gk20a) *
2169                         vm->pdes.num_pdes, GFP_KERNEL);
2170
2171         vm->pdes.ptes[gmmu_page_size_big] =
2172                 kzalloc(sizeof(struct page_table_gk20a) *
2173                         vm->pdes.num_pdes, GFP_KERNEL);
2174
2175         if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2176               vm->pdes.ptes[gmmu_page_size_big]))
2177                 return -ENOMEM;
2178
2179         gk20a_dbg_info("init space for va_limit=0x%llx num_pdes=%d",
2180                    vm->va_limit, vm->pdes.num_pdes);
2181
2182         /* allocate the page table directory */
2183         err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2184                                &vm->pdes.sgt, &vm->pdes.size);
2185         if (err)
2186                 return -ENOMEM;
2187
2188         err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2189                              vm->pdes.size);
2190         if (err) {
2191                 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2192                                         vm->pdes.size);
2193                 return -ENOMEM;
2194         }
2195         gk20a_dbg(gpu_dbg_pte, "pdes.kv = 0x%p, pdes.phys = 0x%llx",
2196                         vm->pdes.kv,
2197                         gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2198         /* we could release vm->pdes.kv but it's only one page... */
2199
2200
2201         /* low-half: alloc small pages */
2202         /* high-half: alloc big pages */
2203         vma_size = mm->channel.size >> 1;
2204
2205         snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2206                  gmmu_page_sizes[gmmu_page_size_small]>>10);
2207         num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_small]);
2208
2209         /* num_pages above is without regard to the low-side hole. */
2210         low_hole_pages = (vm->va_start >>
2211                           gmmu_page_shifts[gmmu_page_size_small]);
2212
2213         gk20a_allocator_init(&vm->vma[gmmu_page_size_small], name,
2214               low_hole_pages,             /* start */
2215               num_pages - low_hole_pages, /* length */
2216               1);                         /* align */
2217
2218         snprintf(name, sizeof(name), "gk20a_as_%d-%dKB", as_share->id,
2219                  gmmu_page_sizes[gmmu_page_size_big]>>10);
2220
2221         num_pages = (u32)(vma_size >> gmmu_page_shifts[gmmu_page_size_big]);
2222         gk20a_allocator_init(&vm->vma[gmmu_page_size_big], name,
2223                               num_pages, /* start */
2224                               num_pages, /* length */
2225                               1); /* align */
2226
2227         vm->mapped_buffers = RB_ROOT;
2228
2229         mutex_init(&vm->update_gmmu_lock);
2230         kref_init(&vm->ref);
2231         INIT_LIST_HEAD(&vm->reserved_va_list);
2232
2233         vm->enable_ctag = true;
2234
2235         return 0;
2236 }
2237
2238
2239 int gk20a_vm_release_share(struct gk20a_as_share *as_share)
2240 {
2241         struct vm_gk20a *vm = as_share->vm;
2242
2243         gk20a_dbg_fn("");
2244
2245         vm->as_share = NULL;
2246
2247         /* put as reference to vm */
2248         gk20a_vm_put(vm);
2249
2250         as_share->vm = NULL;
2251
2252         return 0;
2253 }
2254
2255
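     /*
      * Reserve a va range for later fixed-offset mappings.  The range is
      * carved out of the allocator for the requested page size (at a fixed
      * offset if asked for) and recorded as a vm_reserved_va_node; a SPARSE
      * reservation (big pages only) is immediately backed by the zero page.
      */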
2256 int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2257                          struct nvhost_as_alloc_space_args *args)
2258 {
2259         int err = -ENOMEM;
2260         int pgsz_idx;
2261         u32 start_page_nr;
2262         struct gk20a_allocator *vma;
2263         struct vm_gk20a *vm = as_share->vm;
2264         struct vm_reserved_va_node *va_node;
2265         u64 vaddr_start = 0;
2266
2267         gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
2268                         args->flags, args->page_size, args->pages,
2269                         args->o_a.offset);
2270
2271         /* determine pagesz idx */
2272         for (pgsz_idx = gmmu_page_size_small;
2273              pgsz_idx < gmmu_nr_page_sizes;
2274              pgsz_idx++) {
2275                 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2276                         break;
2277         }
2278
2279         if (pgsz_idx >= gmmu_nr_page_sizes) {
2280                 err = -EINVAL;
2281                 goto clean_up;
2282         }
2283
2284         va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
2285         if (!va_node) {
2286                 err = -ENOMEM;
2287                 goto clean_up;
2288         }
2289
2290         if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE &&
2291             pgsz_idx != gmmu_page_size_big) {
2292                 err = -ENOSYS;
2293                 kfree(va_node);
2294                 goto clean_up;
2295         }
2296
2297         start_page_nr = 0;
2298         if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
2299                 start_page_nr = (u32)(args->o_a.offset >>
2300                                       gmmu_page_shifts[pgsz_idx]);
2301
2302         vma = &vm->vma[pgsz_idx];
2303         err = vma->alloc(vma, &start_page_nr, args->pages);
2304         if (err) {
2305                 kfree(va_node);
2306                 goto clean_up;
2307         }
2308
2309         vaddr_start = (u64)start_page_nr << gmmu_page_shifts[pgsz_idx];
2310
2311         va_node->vaddr_start = vaddr_start;
2312         va_node->size = (u64)args->page_size * (u64)args->pages;
2313         va_node->pgsz_idx = pgsz_idx;
2314         INIT_LIST_HEAD(&va_node->va_buffers_list);
2315         INIT_LIST_HEAD(&va_node->reserved_va_list);
2316
2317         mutex_lock(&vm->update_gmmu_lock);
2318
2319         /* mark that we need to use sparse mappings here */
2320         if (args->flags & NVHOST_AS_ALLOC_SPACE_FLAGS_SPARSE) {
2321                 err = gk20a_vm_put_empty(vm, vaddr_start, args->pages,
2322                                          pgsz_idx);
2323                 if (err) {
2324                         mutex_unlock(&vm->update_gmmu_lock);
2325                         vma->free(vma, start_page_nr, args->pages);
2326                         kfree(va_node);
2327                         goto clean_up;
2328                 }
2329
2330                 va_node->sparse = true;
2331         }
2332         list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
2333
2334         mutex_unlock(&vm->update_gmmu_lock);
2335
2336         args->o_a.offset = vaddr_start;
2337
2338 clean_up:
2339         return err;
2340 }
2341
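     /*
      * Release a va reservation made by gk20a_vm_alloc_space.  Buffers still
      * mapped inside the range are detached from the reservation and live on
      * as ordinary mappings; a sparse reservation also has its zero-page
      * backing unmapped.
      */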
2342 int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2343                         struct nvhost_as_free_space_args *args)
2344 {
2345         int err = -ENOMEM;
2346         int pgsz_idx;
2347         u32 start_page_nr;
2348         struct gk20a_allocator *vma;
2349         struct vm_gk20a *vm = as_share->vm;
2350         struct vm_reserved_va_node *va_node;
2351
2352         gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
2353                         args->pages, args->offset);
2354
2355         /* determine pagesz idx */
2356         for (pgsz_idx = gmmu_page_size_small;
2357              pgsz_idx < gmmu_nr_page_sizes;
2358              pgsz_idx++) {
2359                 if (gmmu_page_sizes[pgsz_idx] == args->page_size)
2360                         break;
2361         }
2362
2363         if (pgsz_idx >= gmmu_nr_page_sizes) {
2364                 err = -EINVAL;
2365                 goto clean_up;
2366         }
2367
2368         start_page_nr = (u32)(args->offset >>
2369                               gmmu_page_shifts[pgsz_idx]);
2370
2371         vma = &vm->vma[pgsz_idx];
2372         err = vma->free(vma, start_page_nr, args->pages);
2373
2374         if (err)
2375                 goto clean_up;
2376
2377         mutex_lock(&vm->update_gmmu_lock);
2378         va_node = addr_to_reservation(vm, args->offset);
2379         if (va_node) {
2380                 struct mapped_buffer_node *buffer, *n;
2381
2382                 /* there is no need to deallocate the buffers in this va
2383                  * range; just convert them into normal buffers */
2384
2385                 list_for_each_entry_safe(buffer, n,
2386                         &va_node->va_buffers_list, va_buffers_list)
2387                         list_del_init(&buffer->va_buffers_list);
2388
2389                 list_del(&va_node->reserved_va_list);
2390
2391                 /* if this was a sparse mapping, free the va */
2392                 if (va_node->sparse)
2393                         __locked_gmmu_unmap(vm,
2394                                 va_node->vaddr_start,
2395                                 va_node->size,
2396                                 va_node->pgsz_idx,
2397                                 false,
2398                                 gk20a_mem_flag_none);
2399                 kfree(va_node);
2400         }
2401         mutex_unlock(&vm->update_gmmu_lock);
2402
2403 clean_up:
2404         return err;
2405 }
2406
2407 int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
2408                           struct channel_gk20a *ch)
2409 {
2410         int err = 0;
2411         struct vm_gk20a *vm = as_share->vm;
2412
2413         gk20a_dbg_fn("");
2414
2415         ch->vm = vm;
2416         err = channel_gk20a_commit_va(ch);
2417         if (err)
2418                 ch->vm = NULL;
2419
2420         return err;
2421 }
2422
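     /*
      * Attach gk20a private data to a dmabuf exactly once.  A static mutex
      * closes the race between the unlocked fast-path check and the
      * allocation, so concurrent callers end up sharing a single priv
      * instance.
      */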
2423 int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
2424 {
2425         struct gk20a_dmabuf_priv *priv;
2426         static DEFINE_MUTEX(priv_lock);
2427
2428         priv = dma_buf_get_drvdata(dmabuf, dev);
2429         if (likely(priv))
2430                 return 0;
2431
2432         mutex_lock(&priv_lock);
2433         priv = dma_buf_get_drvdata(dmabuf, dev);
2434         if (priv)
2435                 goto priv_exist_or_err;
2436         priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2437         if (!priv) {
2438                 priv = ERR_PTR(-ENOMEM);
2439                 goto priv_exist_or_err;
2440         }
2441         mutex_init(&priv->lock);
2442         dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
2443 priv_exist_or_err:
2444         mutex_unlock(&priv_lock);
2445         if (IS_ERR(priv))
2446                 return -ENOMEM;
2447
2448         return 0;
2449 }
2450
2451
2452 static int gk20a_dmabuf_get_kind(struct dma_buf *dmabuf)
2453 {
2454         int kind = 0;
2455 #ifdef CONFIG_TEGRA_NVMAP
2456         int err;
2457         u64 nvmap_param;
2458
2459         err = nvmap_get_dmabuf_param(dmabuf, NVMAP_HANDLE_PARAM_KIND,
2460                                      &nvmap_param);
2461         kind = err ? kind : nvmap_param;
2462 #endif
2463         return kind;
2464 }
2465
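     /*
      * User-facing map path (as.map_buffer): take a reference on the dmabuf
      * fd, resolve the kind (from nvmap when kind == -1) and map the buffer
      * as a user mapping; the resulting gpu va is returned through
      * offset_align.  The dmabuf reference is dropped again if the map
      * fails.
      */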
2466 int gk20a_vm_map_buffer(struct gk20a_as_share *as_share,
2467                         int dmabuf_fd,
2468                         u64 *offset_align,
2469                         u32 flags, /*NVHOST_AS_MAP_BUFFER_FLAGS_*/
2470                         int kind,
2471                         u64 buffer_offset,
2472                         u64 mapping_size)
2473 {
2474         int err = 0;
2475         struct vm_gk20a *vm = as_share->vm;
2476         struct dma_buf *dmabuf;
2477         u64 ret_va;
2478
2479         gk20a_dbg_fn("");
2480
2481         /* get ref to the mem handle (released on unmap_locked) */
2482         dmabuf = dma_buf_get(dmabuf_fd);
2483         if (IS_ERR(dmabuf))
2484                 return PTR_ERR(dmabuf);
2485
2486         err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
2487         if (err) {
2488                 dma_buf_put(dmabuf);
2489                 return err;
2490         }
2491
2492         if (kind == -1)
2493                 kind = gk20a_dmabuf_get_kind(dmabuf);
2494
2495         ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
2496                         flags, kind, NULL, true,
2497                         gk20a_mem_flag_none,
2498                         buffer_offset,
2499                         mapping_size);
2500
2501         *offset_align = ret_va;
2502         if (!ret_va) {
2503                 dma_buf_put(dmabuf);
2504                 err = -EINVAL;
2505         }
2506
2507         return err;
2508 }
2509
2510 int gk20a_vm_unmap_buffer(struct gk20a_as_share *as_share, u64 offset)
2511 {
2512         struct vm_gk20a *vm = as_share->vm;
2513
2514         gk20a_dbg_fn("");
2515
2516         gk20a_vm_unmap_user(vm, offset);
2517         return 0;
2518 }
2519
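     /*
      * Set up the bar1 address space: build its page directory, allocate
      * the bar1 instance block and program the page directory base and
      * address limit into it.
      */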
2520 int gk20a_init_bar1_vm(struct mm_gk20a *mm)
2521 {
2522         int err;
2523         phys_addr_t inst_pa;
2524         void *inst_ptr;
2525         struct vm_gk20a *vm = &mm->bar1.vm;
2526         struct gk20a *g = gk20a_from_mm(mm);
2527         struct device *d = dev_from_gk20a(g);
2528         struct inst_desc *inst_block = &mm->bar1.inst_block;
2529         u64 pde_addr;
2530         u32 pde_addr_lo;
2531         u32 pde_addr_hi;
2532         dma_addr_t iova;
2533
2534         vm->mm = mm;
2535
2536         mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
2537
2538         gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
2539
2540         vm->va_start = mm->pde_stride * 1;
2541         vm->va_limit = mm->bar1.aperture_size;
2542
2543         {
2544                 u32 pde_lo, pde_hi;
2545                 pde_range_from_vaddr_range(vm,
2546                                            0, vm->va_limit-1,
2547                                            &pde_lo, &pde_hi);
2548                 vm->pdes.num_pdes = pde_hi + 1;
2549         }
2550
2551         /* bar1 is likely only to ever use/need small page sizes. */
2552         /* But just in case, for now... arrange for both.*/
2553         vm->pdes.ptes[gmmu_page_size_small] =
2554                 kzalloc(sizeof(struct page_table_gk20a) *
2555                         vm->pdes.num_pdes, GFP_KERNEL);
2556
2557         vm->pdes.ptes[gmmu_page_size_big] =
2558                 kzalloc(sizeof(struct page_table_gk20a) *
2559                         vm->pdes.num_pdes, GFP_KERNEL);
2560
2561         if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2562               vm->pdes.ptes[gmmu_page_size_big]))
2563                 return -ENOMEM;
2564
2565         gk20a_dbg_info("init space for bar1 va_limit=0x%llx num_pdes=%d",
2566                    vm->va_limit, vm->pdes.num_pdes);
2567
2568
2569         /* allocate the page table directory */
2570         err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2571                                &vm->pdes.sgt, &vm->pdes.size);
2572         if (err)
2573                 goto clean_up;
2574
2575         err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2576                              vm->pdes.size);
2577         if (err) {
2578                 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2579                                         vm->pdes.size);
2580                 goto clean_up;
2581         }
2582         gk20a_dbg(gpu_dbg_pte, "bar 1 pdes.kv = 0x%p, pdes.phys = 0x%llx",
2583                         vm->pdes.kv, gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2584         /* we could release vm->pdes.kv but it's only one page... */
2585
2586         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2587         pde_addr_lo = u64_lo32(pde_addr >> 12);
2588         pde_addr_hi = u64_hi32(pde_addr);
2589
2590         gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2591                 (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl),
2592                 pde_addr_lo, pde_addr_hi);
2593
2594         /* allocate instance mem for bar1 */
2595         inst_block->size = ram_in_alloc_size_v();
2596         inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2597                                 &iova, GFP_KERNEL);
2598         if (!inst_block->cpuva) {
2599                 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2600                 err = -ENOMEM;
2601                 goto clean_up;
2602         }
2603
2604         inst_block->iova = iova;
2605         inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2606         if (!inst_block->cpu_pa) {
2607                 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2608                 err = -ENOMEM;
2609                 goto clean_up;
2610         }
2611
2612         inst_pa = inst_block->cpu_pa;
2613         inst_ptr = inst_block->cpuva;
2614
2615         gk20a_dbg_info("bar1 inst block physical phys = 0x%llx, kv = 0x%p",
2616                 (u64)inst_pa, inst_ptr);
2617
2618         memset(inst_ptr, 0, ram_fc_size_val_v());
2619
2620         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2621                 ram_in_page_dir_base_target_vid_mem_f() |
2622                 ram_in_page_dir_base_vol_true_f() |
2623                 ram_in_page_dir_base_lo_f(pde_addr_lo));
2624
2625         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2626                 ram_in_page_dir_base_hi_f(pde_addr_hi));
2627
2628         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2629                  u64_lo32(vm->va_limit) | 0xFFF);
2630
2631         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2632                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2633
2634         gk20a_dbg_info("bar1 inst block ptr: %08llx",  (u64)inst_pa);
2635         gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_bar1",
2636                               1,/*start*/
2637                               (vm->va_limit >> 12) - 1 /* length*/,
2638                               1); /* align */
2639         /* initialize just in case we try to use it anyway */
2640         gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_bar1-unused",
2641                               0x0badc0de, /* start */
2642                               1, /* length */
2643                               1); /* align */
2644
2645         vm->mapped_buffers = RB_ROOT;
2646
2647         mutex_init(&vm->update_gmmu_lock);
2648         kref_init(&vm->ref);
2649         INIT_LIST_HEAD(&vm->reserved_va_list);
2650
2651         return 0;
2652
2653 clean_up:
2654         /* free, etc */
2655         if (inst_block->cpuva)
2656                 dma_free_coherent(d, inst_block->size,
2657                         inst_block->cpuva, inst_block->iova);
2658         inst_block->cpuva = NULL;
2659         inst_block->iova = 0;
2660         return err;
2661 }
2662
2663 /* pmu vm, share channel_vm interfaces */
2664 int gk20a_init_pmu_vm(struct mm_gk20a *mm)
2665 {
2666         int err;
2667         phys_addr_t inst_pa;
2668         void *inst_ptr;
2669         struct vm_gk20a *vm = &mm->pmu.vm;
2670         struct gk20a *g = gk20a_from_mm(mm);
2671         struct device *d = dev_from_gk20a(g);
2672         struct inst_desc *inst_block = &mm->pmu.inst_block;
2673         u64 pde_addr;
2674         u32 pde_addr_lo;
2675         u32 pde_addr_hi;
2676         dma_addr_t iova;
2677
2678         vm->mm = mm;
2679
2680         mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
2681
2682         gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);
2683
2684         vm->va_start  = GK20A_PMU_VA_START;
2685         vm->va_limit  = vm->va_start + mm->pmu.aperture_size;
2686
2687         {
2688                 u32 pde_lo, pde_hi;
2689                 pde_range_from_vaddr_range(vm,
2690                                            0, vm->va_limit-1,
2691                                            &pde_lo, &pde_hi);
2692                 vm->pdes.num_pdes = pde_hi + 1;
2693         }
2694
2695         /* The pmu is likely only to ever use/need small page sizes. */
2696         /* But just in case, for now... arrange for both.*/
2697         vm->pdes.ptes[gmmu_page_size_small] =
2698                 kzalloc(sizeof(struct page_table_gk20a) *
2699                         vm->pdes.num_pdes, GFP_KERNEL);
2700
2701         vm->pdes.ptes[gmmu_page_size_big] =
2702                 kzalloc(sizeof(struct page_table_gk20a) *
2703                         vm->pdes.num_pdes, GFP_KERNEL);
2704
2705         if (!(vm->pdes.ptes[gmmu_page_size_small] &&
2706               vm->pdes.ptes[gmmu_page_size_big]))
2707                 return -ENOMEM;
2708
2709         gk20a_dbg_info("init space for pmu va_limit=0x%llx num_pdes=%d",
2710                    vm->va_limit, vm->pdes.num_pdes);
2711
2712         /* allocate the page table directory */
2713         err = alloc_gmmu_pages(vm, 0, &vm->pdes.ref,
2714                                &vm->pdes.sgt, &vm->pdes.size);
2715         if (err)
2716                 goto clean_up;
2717
2718         err = map_gmmu_pages(vm->pdes.ref, vm->pdes.sgt, &vm->pdes.kv,
2719                              vm->pdes.size);
2720         if (err) {
2721                 free_gmmu_pages(vm, vm->pdes.ref, vm->pdes.sgt, 0,
2722                                         vm->pdes.size);
2723                 goto clean_up;
2724         }
2725         gk20a_dbg_info("pmu pdes phys @ 0x%llx",
2726                         (u64)gk20a_mm_iova_addr(vm->pdes.sgt->sgl));
2727         /* we could release vm->pdes.kv but it's only one page... */
2728
2729         pde_addr = gk20a_mm_iova_addr(vm->pdes.sgt->sgl);
2730         pde_addr_lo = u64_lo32(pde_addr >> 12);
2731         pde_addr_hi = u64_hi32(pde_addr);
2732
2733         gk20a_dbg_info("pde pa=0x%llx pde_addr_lo=0x%x pde_addr_hi=0x%x",
2734                         (u64)pde_addr, pde_addr_lo, pde_addr_hi);
2735
2736         /* allocate instance mem for pmu */
2737         inst_block->size = GK20A_PMU_INST_SIZE;
2738         inst_block->cpuva = dma_alloc_coherent(d, inst_block->size,
2739                                 &iova, GFP_KERNEL);
2740         if (!inst_block->cpuva) {
2741                 gk20a_err(d, "%s: memory allocation failed\n", __func__);
2742                 err = -ENOMEM;
2743                 goto clean_up;
2744         }
2745
2746         inst_block->iova = iova;
2747         inst_block->cpu_pa = gk20a_get_phys_from_iova(d, inst_block->iova);
2748         if (!inst_block->cpu_pa) {
2749                 gk20a_err(d, "%s: failed to get phys address\n", __func__);
2750                 err = -ENOMEM;
2751                 goto clean_up;
2752         }
2753
2754         inst_pa = inst_block->cpu_pa;
2755         inst_ptr = inst_block->cpuva;
2756
2757         gk20a_dbg_info("pmu inst block physical addr: 0x%llx", (u64)inst_pa);
2758
2759         memset(inst_ptr, 0, GK20A_PMU_INST_SIZE);
2760
2761         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
2762                 ram_in_page_dir_base_target_vid_mem_f() |
2763                 ram_in_page_dir_base_vol_true_f() |
2764                 ram_in_page_dir_base_lo_f(pde_addr_lo));
2765
2766         gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
2767                 ram_in_page_dir_base_hi_f(pde_addr_hi));
2768
2769         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
2770                  u64_lo32(vm->va_limit) | 0xFFF);
2771
2772         gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
2773                 ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));
2774
2775         gk20a_allocator_init(&vm->vma[gmmu_page_size_small], "gk20a_pmu",
2776                               (vm->va_start >> 12), /* start */
2777                               (vm->va_limit - vm->va_start) >> 12, /*length*/
2778                               1); /* align */
2779         /* initialize just in case we try to use it anyway */
2780         gk20a_allocator_init(&vm->vma[gmmu_page_size_big], "gk20a_pmu-unused",
2781                               0x0badc0de, /* start */
2782                               1, /* length */
2783                               1); /* align */
2784
2785
2786         vm->mapped_buffers = RB_ROOT;
2787
2788         mutex_init(&vm->update_gmmu_lock);
2789         kref_init(&vm->ref);
2790         INIT_LIST_HEAD(&vm->reserved_va_list);
2791
2792         return 0;
2793
2794 clean_up:
2795         /* free, etc */
2796         if (inst_block->cpuva)
2797                 dma_free_coherent(d, inst_block->size,
2798                         inst_block->cpuva, inst_block->iova);
2799         inst_block->cpuva = NULL;
2800         inst_block->iova = 0;
2801         return err;
2802 }
2803
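     /*
      * Trigger an fb flush and poll until it is no longer
      * pending/outstanding, giving up with -EBUSY once the retry budget is
      * exhausted (pre-silicon platforms poll without a timeout).
      */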
2804 int gk20a_mm_fb_flush(struct gk20a *g)
2805 {
2806         struct mm_gk20a *mm = &g->mm;
2807         u32 data;
2808         s32 retry = 100;
2809         int ret = 0;
2810
2811         gk20a_dbg_fn("");
2812
2813         mutex_lock(&mm->l2_op_lock);
2814
2815         /* Make sure all previous writes are committed to the L2. There's no
2816            guarantee that writes are to DRAM. This will be a sysmembar internal
2817            to the L2. */
2818         gk20a_writel(g, flush_fb_flush_r(),
2819                 flush_fb_flush_pending_busy_f());
2820
2821         do {
2822                 data = gk20a_readl(g, flush_fb_flush_r());
2823
2824                 if (flush_fb_flush_outstanding_v(data) ==
2825                         flush_fb_flush_outstanding_true_v() ||
2826                     flush_fb_flush_pending_v(data) ==
2827                         flush_fb_flush_pending_busy_v()) {
2828                                 gk20a_dbg_info("fb_flush 0x%x", data);
2829                                 retry--;
2830                                 usleep_range(20, 40);
2831                 } else
2832                         break;
2833         } while (retry >= 0 || !tegra_platform_is_silicon());
2834
2835         if (retry < 0) {
2836                 gk20a_warn(dev_from_gk20a(g),
2837                         "fb_flush too many retries");
2838                 ret = -EBUSY;
2839         }
2840
2841         mutex_unlock(&mm->l2_op_lock);
2842
2843         return ret;
2844 }
2845
2846 static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
2847 {
2848         u32 data;
2849         s32 retry = 200;
2850
2851         /* Invalidate any clean lines from the L2 so subsequent reads go to
2852            DRAM. Dirty lines are not affected by this operation. */
2853         gk20a_writel(g, flush_l2_system_invalidate_r(),
2854                 flush_l2_system_invalidate_pending_busy_f());
2855
2856         do {
2857                 data = gk20a_readl(g, flush_l2_system_invalidate_r());
2858
2859                 if (flush_l2_system_invalidate_outstanding_v(data) ==
2860                         flush_l2_system_invalidate_outstanding_true_v() ||
2861                     flush_l2_system_invalidate_pending_v(data) ==
2862                         flush_l2_system_invalidate_pending_busy_v()) {
2863                                 gk20a_dbg_info("l2_system_invalidate 0x%x",
2864                                                 data);
2865                                 retry--;
2866                                 usleep_range(20, 40);
2867                 } else
2868                         break;
2869         } while (retry >= 0 || !tegra_platform_is_silicon());
2870
2871         if (retry < 0)
2872                 gk20a_warn(dev_from_gk20a(g),
2873                         "l2_system_invalidate too many retries");
2874 }
2875
2876 void gk20a_mm_l2_invalidate(struct gk20a *g)
2877 {
2878         struct mm_gk20a *mm = &g->mm;
2879         mutex_lock(&mm->l2_op_lock);
2880         gk20a_mm_l2_invalidate_locked(g);
2881         mutex_unlock(&mm->l2_op_lock);
2882 }
2883
2884 void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
2885 {
2886         struct mm_gk20a *mm = &g->mm;
2887         u32 data;
2888         s32 retry = 200;
2889
2890         gk20a_dbg_fn("");
2891
2892         mutex_lock(&mm->l2_op_lock);
2893
2894         /* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
2895            as clean, so subsequent reads might hit in the L2. */
2896         gk20a_writel(g, flush_l2_flush_dirty_r(),
2897                 flush_l2_flush_dirty_pending_busy_f());
2898
2899         do {
2900                 data = gk20a_readl(g, flush_l2_flush_dirty_r());
2901
2902                 if (flush_l2_flush_dirty_outstanding_v(data) ==
2903                         flush_l2_flush_dirty_outstanding_true_v() ||
2904                     flush_l2_flush_dirty_pending_v(data) ==
2905                         flush_l2_flush_dirty_pending_busy_v()) {
2906                         gk20a_dbg_info("l2_flush_dirty 0x%x", data);
2907                         retry--;
2908                         usleep_range(20, 40);
2909                 } else
2910                         break;
2911         } while (retry >= 0 || !tegra_platform_is_silicon());
2912
2913         if (retry < 0)
2914                 gk20a_warn(dev_from_gk20a(g),
2915                         "l2_flush_dirty too many retries");
2916
2917         if (invalidate)
2918                 gk20a_mm_l2_invalidate_locked(g);
2919
2920         mutex_unlock(&mm->l2_op_lock);
2921 }
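
/*
 * Illustrative sketch, not part of the original driver: how the L2
 * maintenance entry points above are meant to be combined. External
 * callers use the wrappers, which take mm->l2_op_lock themselves;
 * gk20a_mm_l2_flush(g, true) already holds that lock when it calls the
 * _locked invalidate variant.
 */
static void __maybe_unused gk20a_example_l2_maintenance(struct gk20a *g)
{
        /* Standalone invalidate of clean lines; the wrapper locks. */
        gk20a_mm_l2_invalidate(g);

        /* Flush dirty lines to DRAM, then invalidate, in one lock hold. */
        gk20a_mm_l2_flush(g, true);
}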
2922
2923
2924 int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
2925                          struct dma_buf **dmabuf,
2926                          u64 *offset)
2927 {
2928         struct mapped_buffer_node *mapped_buffer;
2929
2930         gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);
2931
2932         mutex_lock(&vm->update_gmmu_lock);
2933
2934         mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
2935                                                         gpu_va);
2936         if (!mapped_buffer) {
2937                 mutex_unlock(&vm->update_gmmu_lock);
2938                 return -EINVAL;
2939         }
2940
2941         *dmabuf = mapped_buffer->dmabuf;
2942         *offset = gpu_va - mapped_buffer->addr;
2943
2944         mutex_unlock(&vm->update_gmmu_lock);
2945
2946         return 0;
2947 }
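
/*
 * Illustrative sketch, not part of the original driver: translating a
 * GPU virtual address back to its backing dma_buf and the offset within
 * it via gk20a_vm_find_buffer() above.
 */
static int __maybe_unused gk20a_example_lookup(struct vm_gk20a *vm, u64 gpu_va)
{
        struct dma_buf *dmabuf;
        u64 offset;
        int err;

        err = gk20a_vm_find_buffer(vm, gpu_va, &dmabuf, &offset);
        if (err)
                return err; /* -EINVAL: no mapping covers gpu_va */

        gk20a_dbg_info("gpu_va 0x%llx -> dmabuf %p + 0x%llx",
                       gpu_va, dmabuf, offset);
        return 0;
}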
2948
2949 void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
2950 {
2951         struct gk20a *g = gk20a_from_vm(vm);
2952         u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->pdes.sgt->sgl) >> 12);
2953         u32 data;
2954         s32 retry = 200;
2955         static DEFINE_MUTEX(tlb_lock);
2956
2957         gk20a_dbg_fn("");
2958
2959         /* Page tables are considered SW state and are preserved across
2960            prepare_poweroff. When gk20a deinit releases those page tables,
2961            common code in the vm unmap path calls TLB invalidate, which
2962            touches HW. Use the power_on flag to skip the invalidation while
2963            GPU power is turned off. */
2964
2965         if (!g->power_on)
2966                 return;
2967
2968         /* No need to invalidate if tlb is clean */
2969         mutex_lock(&vm->update_gmmu_lock);
2970         if (!vm->tlb_dirty) {
2971                 mutex_unlock(&vm->update_gmmu_lock);
2972                 return;
2973         }
2974
2975         mutex_lock(&tlb_lock);
2976         do {
2977                 data = gk20a_readl(g, fb_mmu_ctrl_r());
2978                 if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
2979                         break;
2980                 usleep_range(20, 40);
2981                 retry--;
2982         } while (retry >= 0 || !tegra_platform_is_silicon());
2983
2984         if (retry < 0) {
2985                 gk20a_warn(dev_from_gk20a(g),
2986                         "wait mmu fifo space too many retries");
2987                 goto out;
2988         }
2989
2990         gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
2991                 fb_mmu_invalidate_pdb_addr_f(addr_lo) |
2992                 fb_mmu_invalidate_pdb_aperture_vid_mem_f());
2993
2994         gk20a_writel(g, fb_mmu_invalidate_r(),
2995                 fb_mmu_invalidate_all_va_true_f() |
2996                 fb_mmu_invalidate_trigger_true_f());
2997
2998         do {
2999                 data = gk20a_readl(g, fb_mmu_ctrl_r());
3000                 if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
3001                         fb_mmu_ctrl_pri_fifo_empty_false_f())
3002                         break;
3003                 retry--;
3004                 usleep_range(20, 40);
3005         } while (retry >= 0 || !tegra_platform_is_silicon());
3006
3007         if (retry < 0)
3008                 gk20a_warn(dev_from_gk20a(g),
3009                         "mmu invalidate too many retries");
3010
3011 out:
3012         mutex_unlock(&tlb_lock);
3013         vm->tlb_dirty = false;
3014         mutex_unlock(&vm->update_gmmu_lock);
3015 }
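
/*
 * Illustrative sketch, not part of the original driver: the caller
 * sequence the invalidate above expects. Code that rewrites GMMU page
 * table entries marks the VM dirty under update_gmmu_lock and then
 * requests an invalidate, which is skipped when the GPU is powered off
 * or the TLB is already clean.
 */
static void __maybe_unused gk20a_example_after_pte_update(struct vm_gk20a *vm)
{
        mutex_lock(&vm->update_gmmu_lock);
        vm->tlb_dirty = true;
        mutex_unlock(&vm->update_gmmu_lock);

        gk20a_mm_tlb_invalidate(vm);
}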
3016
3017 int gk20a_mm_suspend(struct gk20a *g)
3018 {
3019         gk20a_dbg_fn("");
3020
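        /* Flush the L2 through the per-chip ltc op so memory is coherent
           before the GPU is power-gated. */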
3021         g->ops.ltc.elpg_flush(g);
3022
3023         gk20a_dbg_fn("done");
3024         return 0;
3025 }
3026
3027 void gk20a_mm_ltc_isr(struct gk20a *g)
3028 {
3029         u32 intr;
3030
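        /* Read and log the pending LTC interrupt bits, then clear them by
           writing the value back. */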
3031         intr = gk20a_readl(g, ltc_ltc0_ltss_intr_r());
3032         gk20a_err(dev_from_gk20a(g), "ltc: %08x\n", intr);
3033         gk20a_writel(g, ltc_ltc0_ltss_intr_r(), intr);
3034 }
3035
3036 bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
3037 {
3038         u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());
3039         return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
3040                 fb_mmu_debug_ctrl_debug_enabled_v();
3041 }