2 * GK20A memory management
4 * Copyright (c) 2011-2015, NVIDIA CORPORATION. All rights reserved.
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License,
8 * version 2, as published by the Free Software Foundation.
10 * This program is distributed in the hope it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program. If not, see <http://www.gnu.org/licenses/>.
19 #include <linux/delay.h>
20 #include <linux/highmem.h>
21 #include <linux/log2.h>
22 #include <linux/nvhost.h>
23 #include <linux/pm_runtime.h>
24 #include <linux/scatterlist.h>
25 #include <linux/nvmap.h>
26 #include <linux/tegra-soc.h>
27 #include <linux/vmalloc.h>
28 #include <linux/dma-buf.h>
29 #include <uapi/linux/nvgpu.h>
30 #include <trace/events/gk20a.h>
34 #include "fence_gk20a.h"
35 #include "hw_gmmu_gk20a.h"
36 #include "hw_fb_gk20a.h"
37 #include "hw_bus_gk20a.h"
38 #include "hw_ram_gk20a.h"
39 #include "hw_mc_gk20a.h"
40 #include "hw_flush_gk20a.h"
41 #include "hw_ltc_gk20a.h"
43 #include "kind_gk20a.h"
44 #include "semaphore_gk20a.h"
47 * GPU mapping life cycle
48 * ======================
53 * Kernel mappings are created through vm.map(..., false):
55 * - Mappings to the same allocations are reused and refcounted.
56 * - This path does not support deferred unmapping (i.e. kernel must wait for
57 * all hw operations on the buffer to complete before unmapping).
58 * - References to dmabuf are owned and managed by the (kernel) clients of
59 * the gk20a_vm layer.
65 * User space mappings are created through as.map_buffer -> vm.map(..., true):
67 * - Mappings to the same allocations are reused and refcounted.
68 * - This path supports deferred unmapping (i.e. we delay the actual unmapping
69 * until all hw operations have completed).
70 * - References to dmabuf are owned and managed by the vm_gk20a
71 * layer itself. vm.map acquires these refs, and sets
72 * mapped_buffer->own_mem_ref to record that we must release the refs when we
73 * unmap.
74 */
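/*
 * Illustrative sketch (not part of the original driver code): a kernel
 * client owns its own dma_buf reference and must tear the mapping down
 * synchronously once the hw is done with the buffer, e.g.
 *
 *	gpu_va = gk20a_vm_map(vm, dmabuf, ..., false, ...);
 *	...submit work, wait for completion...
 *	gk20a_vm_unmap(vm, gpu_va);
 *	dma_buf_put(dmabuf);
 *
 * User space instead goes through the as.map_buffer path
 * (gk20a_vm_map_buffer() below), which maps with user_mapped = true; the
 * actual unmap is deferred via the mapped_buffer refcount and the vm
 * layer owns the dma_buf reference (own_mem_ref).
 */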
77 static inline int vm_aspace_id(struct vm_gk20a *vm)
79 /* -1 is bar1 or pmu, etc. */
80 return vm->as_share ? vm->as_share->id : -1;
82 static inline u32 hi32(u64 f)
84 return (u32)(f >> 32);
86 static inline u32 lo32(u64 f)
88 return (u32)(f & 0xffffffff);
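/*
 * Example: hi32(0x12345678abcd0000ULL) == 0x12345678 and
 * lo32(0x12345678abcd0000ULL) == 0xabcd0000.
 */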
91 static struct mapped_buffer_node *find_mapped_buffer_locked(
92 struct rb_root *root, u64 addr);
93 static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
94 struct rb_root *root, struct dma_buf *dmabuf,
96 static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
97 enum gmmu_pgsz_gk20a pgsz_idx,
98 struct sg_table *sgt, u64 buffer_offset,
99 u64 first_vaddr, u64 last_vaddr,
100 u8 kind_v, u32 ctag_offset, bool cacheable,
101 bool unmapped_pte, int rw_flag,
103 static int __must_check gk20a_init_system_vm(struct mm_gk20a *mm);
104 static int __must_check gk20a_init_bar1_vm(struct mm_gk20a *mm);
105 static int __must_check gk20a_init_hwpm(struct mm_gk20a *mm);
108 struct gk20a_dmabuf_priv {
111 struct gk20a_allocator *comptag_allocator;
112 struct gk20a_comptags comptags;
114 struct dma_buf_attachment *attach;
115 struct sg_table *sgt;
119 struct list_head states;
122 static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm);
124 static void gk20a_mm_delete_priv(void *_priv)
126 struct gk20a_buffer_state *s, *s_tmp;
127 struct gk20a_dmabuf_priv *priv = _priv;
131 if (priv->comptags.lines) {
132 BUG_ON(!priv->comptag_allocator);
133 priv->comptag_allocator->free(priv->comptag_allocator,
134 priv->comptags.offset,
135 priv->comptags.lines, 1);
138 /* Free buffer states */
139 list_for_each_entry_safe(s, s_tmp, &priv->states, list) {
140 gk20a_fence_put(s->fence);
148 struct sg_table *gk20a_mm_pin(struct device *dev, struct dma_buf *dmabuf)
150 struct gk20a_dmabuf_priv *priv;
152 priv = dma_buf_get_drvdata(dmabuf, dev);
154 return ERR_PTR(-EINVAL);
156 mutex_lock(&priv->lock);
158 if (priv->pin_count == 0) {
159 priv->attach = dma_buf_attach(dmabuf, dev);
160 if (IS_ERR(priv->attach)) {
161 mutex_unlock(&priv->lock);
162 return (struct sg_table *)priv->attach;
165 priv->sgt = dma_buf_map_attachment(priv->attach,
167 if (IS_ERR(priv->sgt)) {
168 dma_buf_detach(dmabuf, priv->attach);
169 mutex_unlock(&priv->lock);
175 mutex_unlock(&priv->lock);
179 void gk20a_mm_unpin(struct device *dev, struct dma_buf *dmabuf,
180 struct sg_table *sgt)
182 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
185 if (IS_ERR(priv) || !priv)
188 mutex_lock(&priv->lock);
189 WARN_ON(priv->sgt != sgt);
191 WARN_ON(priv->pin_count < 0);
192 dma_addr = sg_dma_address(priv->sgt->sgl);
193 if (priv->pin_count == 0) {
194 dma_buf_unmap_attachment(priv->attach, priv->sgt,
196 dma_buf_detach(dmabuf, priv->attach);
198 mutex_unlock(&priv->lock);
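/*
 * Illustrative pairing of the pin/unpin helpers above (a sketch, not from
 * the original source):
 *
 *	struct sg_table *sgt = gk20a_mm_pin(dev, dmabuf);
 *	if (IS_ERR(sgt))
 *		return PTR_ERR(sgt);
 *	...program the gmmu from sgt...
 *	gk20a_mm_unpin(dev, dmabuf, sgt);
 *
 * Pins are refcounted per dmabuf via priv->pin_count, so nested pin/unpin
 * pairs reuse the same attachment and sg_table.
 */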
201 void gk20a_get_comptags(struct device *dev, struct dma_buf *dmabuf,
202 struct gk20a_comptags *comptags)
204 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
211 comptags->offset = 0;
215 *comptags = priv->comptags;
218 static int gk20a_alloc_comptags(struct device *dev,
219 struct dma_buf *dmabuf,
220 struct gk20a_allocator *allocator,
223 struct gk20a_dmabuf_priv *priv = dma_buf_get_drvdata(dmabuf, dev);
233 /* store the allocator so we can use it when we free the ctags */
234 priv->comptag_allocator = allocator;
235 err = allocator->alloc(allocator, &offset, lines, 1);
237 priv->comptags.lines = lines;
238 priv->comptags.offset = offset;
246 static int gk20a_init_mm_reset_enable_hw(struct gk20a *g)
252 if (g->ops.clock_gating.slcg_fb_load_gating_prod)
253 g->ops.clock_gating.slcg_fb_load_gating_prod(g,
255 if (g->ops.clock_gating.slcg_ltc_load_gating_prod)
256 g->ops.clock_gating.slcg_ltc_load_gating_prod(g,
258 if (g->ops.clock_gating.blcg_fb_load_gating_prod)
259 g->ops.clock_gating.blcg_fb_load_gating_prod(g,
261 if (g->ops.clock_gating.blcg_ltc_load_gating_prod)
262 g->ops.clock_gating.blcg_ltc_load_gating_prod(g,
265 if (g->ops.fb.init_fs_state)
266 g->ops.fb.init_fs_state(g);
271 static void gk20a_remove_vm(struct vm_gk20a *vm, struct mem_desc *inst_block)
273 struct gk20a *g = vm->mm->g;
277 gk20a_free_inst_block(g, inst_block);
278 gk20a_vm_remove_support_nofree(vm);
281 static void gk20a_remove_mm_support(struct mm_gk20a *mm)
283 gk20a_remove_vm(&mm->bar1.vm, &mm->bar1.inst_block);
284 gk20a_remove_vm(&mm->pmu.vm, &mm->pmu.inst_block);
285 gk20a_free_inst_block(gk20a_from_mm(mm), &mm->hwpm.inst_block);
288 int gk20a_init_mm_setup_sw(struct gk20a *g)
290 struct mm_gk20a *mm = &g->mm;
296 gk20a_dbg_fn("skip init");
301 mutex_init(&mm->l2_op_lock);
303 /* TBD: make channel vm size configurable */
304 mm->channel.size = 1ULL << NV_GMMU_VA_RANGE;
306 gk20a_dbg_info("channel vm size: %dMB", (int)(mm->channel.size >> 20));
308 err = gk20a_init_bar1_vm(mm);
312 if (g->ops.mm.init_bar2_vm) {
313 err = g->ops.mm.init_bar2_vm(g);
317 err = gk20a_init_system_vm(mm);
321 err = gk20a_init_hwpm(mm);
325 /* set vm_alloc_share op here as gk20a_as_alloc_share needs it */
326 g->ops.mm.vm_alloc_share = gk20a_vm_alloc_share;
327 mm->remove_support = gk20a_remove_mm_support;
330 gk20a_dbg_fn("done");
334 /* make sure gk20a_init_mm_support is called before this */
335 int gk20a_init_mm_setup_hw(struct gk20a *g)
337 struct mm_gk20a *mm = &g->mm;
338 struct mem_desc *inst_block = &mm->bar1.inst_block;
339 phys_addr_t inst_pa = gk20a_mem_phys(inst_block);
344 g->ops.fb.set_mmu_page_size(g);
346 inst_pa = (u32)(inst_pa >> bar1_instance_block_shift_gk20a());
347 gk20a_dbg_info("bar1 inst block ptr: 0x%08x", (u32)inst_pa);
349 gk20a_writel(g, bus_bar1_block_r(),
350 bus_bar1_block_target_vid_mem_f() |
351 bus_bar1_block_mode_virtual_f() |
352 bus_bar1_block_ptr_f(inst_pa));
354 if (g->ops.mm.init_bar2_mm_hw_setup) {
355 err = g->ops.mm.init_bar2_mm_hw_setup(g);
360 if (gk20a_mm_fb_flush(g) || gk20a_mm_fb_flush(g))
363 gk20a_dbg_fn("done");
367 int gk20a_init_mm_support(struct gk20a *g)
371 err = gk20a_init_mm_reset_enable_hw(g);
375 err = gk20a_init_mm_setup_sw(g);
379 if (g->ops.mm.init_mm_setup_hw)
380 err = g->ops.mm.init_mm_setup_hw(g);
385 static int alloc_gmmu_phys_pages(struct vm_gk20a *vm, u32 order,
386 struct gk20a_mm_entry *entry)
388 u32 num_pages = 1 << order;
389 u32 len = num_pages * PAGE_SIZE;
395 pages = alloc_pages(GFP_KERNEL, order);
397 gk20a_dbg(gpu_dbg_pte, "alloc_pages failed\n");
400 entry->sgt = kzalloc(sizeof(*entry->sgt), GFP_KERNEL);
402 gk20a_dbg(gpu_dbg_pte, "cannot allocate sg table");
405 err = sg_alloc_table(entry->sgt, 1, GFP_KERNEL);
407 gk20a_dbg(gpu_dbg_pte, "sg_alloc_table failed\n");
410 sg_set_page(entry->sgt->sgl, pages, len, 0);
411 entry->cpu_va = page_address(pages);
412 memset(entry->cpu_va, 0, len);
414 FLUSH_CPU_DCACHE(entry->cpu_va, sg_phys(entry->sgt->sgl), len);
421 __free_pages(pages, order);
426 static void free_gmmu_phys_pages(struct vm_gk20a *vm,
427 struct gk20a_mm_entry *entry)
430 free_pages((unsigned long)entry->cpu_va, get_order(entry->size));
431 entry->cpu_va = NULL;
433 sg_free_table(entry->sgt);
438 static int map_gmmu_phys_pages(struct gk20a_mm_entry *entry)
440 FLUSH_CPU_DCACHE(entry->cpu_va,
441 sg_phys(entry->sgt->sgl),
442 entry->sgt->sgl->length);
446 static void unmap_gmmu_phys_pages(struct gk20a_mm_entry *entry)
448 FLUSH_CPU_DCACHE(entry->cpu_va,
449 sg_phys(entry->sgt->sgl),
450 entry->sgt->sgl->length);
453 static int alloc_gmmu_pages(struct vm_gk20a *vm, u32 order,
454 struct gk20a_mm_entry *entry)
456 struct device *d = dev_from_vm(vm);
457 u32 num_pages = 1 << order;
458 u32 len = num_pages * PAGE_SIZE;
460 DEFINE_DMA_ATTRS(attrs);
466 if (tegra_platform_is_linsim())
467 return alloc_gmmu_phys_pages(vm, order, entry);
472 * On arm32 we're limited by vmalloc space, so we do not map pages by
473 * default.
474 */
475 if (IS_ENABLED(CONFIG_ARM64)) {
476 cpuva = dma_zalloc_coherent(d, len, &iova, GFP_KERNEL);
478 gk20a_err(d, "memory allocation failed\n");
482 err = gk20a_get_sgtable(d, &entry->sgt, cpuva, iova, len);
484 gk20a_err(d, "sgt allocation failed\n");
488 entry->cpu_va = cpuva;
492 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
493 pages = dma_alloc_attrs(d, len, &iova, GFP_KERNEL, &attrs);
495 gk20a_err(d, "memory allocation failed\n");
499 err = gk20a_get_sgtable_from_pages(d, &entry->sgt, pages,
502 gk20a_err(d, "sgt allocation failed\n");
506 entry->pages = pages;
512 if (IS_ENABLED(CONFIG_ARM64)) {
513 dma_free_coherent(d, len, entry->cpu_va, iova);
516 dma_free_attrs(d, len, entry->pages, iova, &attrs);
524 void free_gmmu_pages(struct vm_gk20a *vm,
525 struct gk20a_mm_entry *entry)
527 struct device *d = dev_from_vm(vm);
529 DEFINE_DMA_ATTRS(attrs);
535 if (tegra_platform_is_linsim()) {
536 free_gmmu_phys_pages(vm, entry);
540 iova = sg_dma_address(entry->sgt->sgl);
542 gk20a_free_sgtable(&entry->sgt);
545 * On arm32 we're limited by vmalloc space, so we do not map pages by
546 * default.
547 */
548 if (IS_ENABLED(CONFIG_ARM64)) {
549 dma_free_coherent(d, entry->size, entry->cpu_va, iova);
550 entry->cpu_va = NULL;
552 dma_set_attr(DMA_ATTR_NO_KERNEL_MAPPING, &attrs);
553 dma_free_attrs(d, entry->size, entry->pages, iova, &attrs);
559 int map_gmmu_pages(struct gk20a_mm_entry *entry)
561 int count = PAGE_ALIGN(entry->size) >> PAGE_SHIFT;
565 if (tegra_platform_is_linsim())
566 return map_gmmu_phys_pages(entry);
568 if (IS_ENABLED(CONFIG_ARM64)) {
569 FLUSH_CPU_DCACHE(entry->cpu_va,
570 sg_phys(entry->sgt->sgl),
573 pages = entry->pages;
574 entry->cpu_va = vmap(pages, count, 0,
575 pgprot_dmacoherent(PAGE_KERNEL));
583 void unmap_gmmu_pages(struct gk20a_mm_entry *entry)
587 if (tegra_platform_is_linsim()) {
588 unmap_gmmu_phys_pages(entry);
592 if (IS_ENABLED(CONFIG_ARM64)) {
593 FLUSH_CPU_DCACHE(entry->cpu_va,
594 sg_phys(entry->sgt->sgl),
597 vunmap(entry->cpu_va);
598 entry->cpu_va = NULL;
602 /* allocate a phys contig region big enough for a full
603 * sized gmmu page table for the given gmmu_page_size.
604 * the whole range is zeroed so it's "invalid"/will fault
607 static int gk20a_zalloc_gmmu_page_table(struct vm_gk20a *vm,
608 enum gmmu_pgsz_gk20a pgsz_idx,
609 const struct gk20a_mmu_level *l,
610 struct gk20a_mm_entry *entry)
617 /* allocate enough pages for the table */
618 order = l->hi_bit[pgsz_idx] - l->lo_bit[pgsz_idx] + 1;
619 order += ilog2(l->entry_size);
621 order = max(0, order);
623 err = alloc_gmmu_pages(vm, order, entry);
624 gk20a_dbg(gpu_dbg_pte, "entry = 0x%p, addr=%08llx, size %d",
625 entry, gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl), order);
628 entry->pgsz = pgsz_idx;
633 int gk20a_mm_pde_coverage_bit_count(struct vm_gk20a *vm)
635 return vm->mmu_levels[0].lo_bit[0];
638 /* given address range (inclusive) determine the pdes crossed */
639 void pde_range_from_vaddr_range(struct vm_gk20a *vm,
640 u64 addr_lo, u64 addr_hi,
641 u32 *pde_lo, u32 *pde_hi)
643 int pde_shift = gk20a_mm_pde_coverage_bit_count(vm);
645 *pde_lo = (u32)(addr_lo >> pde_shift);
646 *pde_hi = (u32)(addr_hi >> pde_shift);
647 gk20a_dbg(gpu_dbg_pte, "addr_lo=0x%llx addr_hi=0x%llx pde_ss=%d",
648 addr_lo, addr_hi, pde_shift);
649 gk20a_dbg(gpu_dbg_pte, "pde_lo=%d pde_hi=%d",
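/*
 * Worked example (assuming the 128KB-big-page layout, where one PDE spans
 * 1 << 27 bytes = 128MB): addr_lo = 0x0 and addr_hi = 0x10000000 give
 * pde_lo = 0 and pde_hi = 2, i.e. three PDEs are crossed.
 */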
653 u32 *pde_from_index(struct vm_gk20a *vm, u32 i)
655 return (u32 *) (((u8 *)vm->pdb.cpu_va) + i*gmmu_pde__size_v());
658 u32 pte_index_from_vaddr(struct vm_gk20a *vm,
659 u64 addr, enum gmmu_pgsz_gk20a pgsz_idx)
662 /* mask off pde part */
663 addr = addr & ((1ULL << gk20a_mm_pde_coverage_bit_count(vm)) - 1ULL);
665 /* shift over to get pte index. note assumption that pte index
666 * doesn't leak over into the high 32b */
667 ret = (u32)(addr >> ilog2(vm->gmmu_page_sizes[pgsz_idx]));
669 gk20a_dbg(gpu_dbg_pte, "addr=0x%llx pte_i=0x%x", addr, ret);
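/*
 * Worked example (same 128MB-per-PDE assumption, 4KB small pages):
 * addr = 0x08003000 masks down to a pde-local offset of 0x3000, so the
 * small-page pte index is 0x3000 >> 12 = 3.
 */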
673 static struct vm_reserved_va_node *addr_to_reservation(struct vm_gk20a *vm,
676 struct vm_reserved_va_node *va_node;
677 list_for_each_entry(va_node, &vm->reserved_va_list, reserved_va_list)
678 if (addr >= va_node->vaddr_start &&
679 addr < (u64)va_node->vaddr_start + (u64)va_node->size)
685 int gk20a_vm_get_buffers(struct vm_gk20a *vm,
686 struct mapped_buffer_node ***mapped_buffers,
689 struct mapped_buffer_node *mapped_buffer;
690 struct mapped_buffer_node **buffer_list;
691 struct rb_node *node;
694 mutex_lock(&vm->update_gmmu_lock);
696 buffer_list = nvgpu_alloc(sizeof(*buffer_list) *
697 vm->num_user_mapped_buffers, true);
699 mutex_unlock(&vm->update_gmmu_lock);
703 node = rb_first(&vm->mapped_buffers);
706 container_of(node, struct mapped_buffer_node, node);
707 if (mapped_buffer->user_mapped) {
708 buffer_list[i] = mapped_buffer;
709 kref_get(&mapped_buffer->ref);
712 node = rb_next(&mapped_buffer->node);
715 BUG_ON(i != vm->num_user_mapped_buffers);
717 *num_buffers = vm->num_user_mapped_buffers;
718 *mapped_buffers = buffer_list;
720 mutex_unlock(&vm->update_gmmu_lock);
725 static void gk20a_vm_unmap_locked_kref(struct kref *ref)
727 struct mapped_buffer_node *mapped_buffer =
728 container_of(ref, struct mapped_buffer_node, ref);
729 gk20a_vm_unmap_locked(mapped_buffer);
732 void gk20a_vm_put_buffers(struct vm_gk20a *vm,
733 struct mapped_buffer_node **mapped_buffers,
738 mutex_lock(&vm->update_gmmu_lock);
740 for (i = 0; i < num_buffers; ++i)
741 kref_put(&mapped_buffers[i]->ref,
742 gk20a_vm_unmap_locked_kref);
744 mutex_unlock(&vm->update_gmmu_lock);
746 nvgpu_free(mapped_buffers);
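/*
 * Illustrative use of the get/put pair above (a sketch; the count
 * out-parameter is assumed to be an int here): take a refcounted snapshot
 * of the user-mapped buffers, inspect it outside the vm lock, then drop
 * the references.
 *
 *	struct mapped_buffer_node **bufs;
 *	int n, i;
 *
 *	if (!gk20a_vm_get_buffers(vm, &bufs, &n)) {
 *		for (i = 0; i < n; i++)
 *			...inspect bufs[i]...
 *		gk20a_vm_put_buffers(vm, bufs, n);
 *	}
 */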
749 static void gk20a_vm_unmap_user(struct vm_gk20a *vm, u64 offset)
751 struct device *d = dev_from_vm(vm);
753 struct mapped_buffer_node *mapped_buffer;
755 mutex_lock(&vm->update_gmmu_lock);
757 mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
758 if (!mapped_buffer) {
759 mutex_unlock(&vm->update_gmmu_lock);
760 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
764 if (mapped_buffer->flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
765 mutex_unlock(&vm->update_gmmu_lock);
767 if (tegra_platform_is_silicon())
772 if (atomic_read(&mapped_buffer->ref.refcount) == 1)
778 gk20a_err(d, "sync-unmap failed on 0x%llx",
780 mutex_lock(&vm->update_gmmu_lock);
783 mapped_buffer->user_mapped--;
784 if (mapped_buffer->user_mapped == 0)
785 vm->num_user_mapped_buffers--;
786 kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
788 mutex_unlock(&vm->update_gmmu_lock);
791 u64 gk20a_vm_alloc_va(struct vm_gk20a *vm,
793 enum gmmu_pgsz_gk20a gmmu_pgsz_idx)
796 struct gk20a_allocator *vma = &vm->vma[gmmu_pgsz_idx];
799 u32 start_page_nr = 0, num_pages;
800 u64 gmmu_page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
802 if (gmmu_pgsz_idx >= gmmu_nr_page_sizes) {
803 dev_warn(dev_from_vm(vm),
804 "invalid page size requested in gk20a vm alloc");
808 if ((gmmu_pgsz_idx == gmmu_page_size_big) && !vm->big_pages) {
809 dev_warn(dev_from_vm(vm),
810 "unsupportd page size requested");
815 /* be certain we round up to gmmu_page_size if needed */
816 /* TBD: DIV_ROUND_UP -> undefined reference to __aeabi_uldivmod */
817 size = (size + ((u64)gmmu_page_size - 1)) & ~((u64)gmmu_page_size - 1);
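/* e.g. with a 4KB gmmu page size, a requested size of 0x1001 rounds up to 0x2000 */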
819 gk20a_dbg_info("size=0x%llx @ pgsz=%dKB", size,
820 vm->gmmu_page_sizes[gmmu_pgsz_idx]>>10);
822 /* The vma allocator represents page accounting. */
823 num_pages = size >> ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
825 err = vma->alloc(vma, &start_page_nr, num_pages, 1);
828 gk20a_err(dev_from_vm(vm),
829 "%s oom: sz=0x%llx", vma->name, size);
833 offset = (u64)start_page_nr <<
834 ilog2(vm->gmmu_page_sizes[gmmu_pgsz_idx]);
835 gk20a_dbg_fn("%s found addr: 0x%llx", vma->name, offset);
840 int gk20a_vm_free_va(struct vm_gk20a *vm,
841 u64 offset, u64 size,
842 enum gmmu_pgsz_gk20a pgsz_idx)
844 struct gk20a_allocator *vma = &vm->vma[pgsz_idx];
845 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
846 u32 page_shift = ilog2(page_size);
847 u32 start_page_nr, num_pages;
850 gk20a_dbg_info("%s free addr=0x%llx, size=0x%llx",
851 vma->name, offset, size);
853 start_page_nr = (u32)(offset >> page_shift);
854 num_pages = (u32)((size + page_size - 1) >> page_shift);
856 err = vma->free(vma, start_page_nr, num_pages, 1);
858 gk20a_err(dev_from_vm(vm),
859 "not found: offset=0x%llx, sz=0x%llx",
866 static int insert_mapped_buffer(struct rb_root *root,
867 struct mapped_buffer_node *mapped_buffer)
869 struct rb_node **new_node = &(root->rb_node), *parent = NULL;
871 /* Figure out where to put new node */
873 struct mapped_buffer_node *cmp_with =
874 container_of(*new_node, struct mapped_buffer_node,
879 if (cmp_with->addr > mapped_buffer->addr) /* u64 cmp */
880 new_node = &((*new_node)->rb_left);
881 else if (cmp_with->addr != mapped_buffer->addr) /* u64 cmp */
882 new_node = &((*new_node)->rb_right);
884 return -EINVAL; /* no fair dup'ing */
887 /* Add new node and rebalance tree. */
888 rb_link_node(&mapped_buffer->node, parent, new_node);
889 rb_insert_color(&mapped_buffer->node, root);
894 static struct mapped_buffer_node *find_mapped_buffer_reverse_locked(
895 struct rb_root *root, struct dma_buf *dmabuf,
898 struct rb_node *node = rb_first(root);
900 struct mapped_buffer_node *mapped_buffer =
901 container_of(node, struct mapped_buffer_node, node);
902 if (mapped_buffer->dmabuf == dmabuf &&
903 kind == mapped_buffer->kind)
904 return mapped_buffer;
905 node = rb_next(&mapped_buffer->node);
910 static struct mapped_buffer_node *find_mapped_buffer_locked(
911 struct rb_root *root, u64 addr)
914 struct rb_node *node = root->rb_node;
916 struct mapped_buffer_node *mapped_buffer =
917 container_of(node, struct mapped_buffer_node, node);
918 if (mapped_buffer->addr > addr) /* u64 cmp */
919 node = node->rb_left;
920 else if (mapped_buffer->addr != addr) /* u64 cmp */
921 node = node->rb_right;
923 return mapped_buffer;
928 static struct mapped_buffer_node *find_mapped_buffer_range_locked(
929 struct rb_root *root, u64 addr)
931 struct rb_node *node = root->rb_node;
933 struct mapped_buffer_node *m =
934 container_of(node, struct mapped_buffer_node, node);
935 if (m->addr <= addr && m->addr + m->size > addr)
937 else if (m->addr > addr) /* u64 cmp */
938 node = node->rb_left;
940 node = node->rb_right;
945 #define BFR_ATTRS (sizeof(nvmap_bfr_param)/sizeof(nvmap_bfr_param[0]))
947 struct buffer_attrs {
948 struct sg_table *sgt;
958 static void gmmu_select_page_size(struct vm_gk20a *vm,
959 struct buffer_attrs *bfr)
962 /* choose the biggest first (top->bottom) */
963 for (i = gmmu_nr_page_sizes-1; i >= 0; i--)
964 if (!((vm->gmmu_page_sizes[i] - 1) & bfr->align)) {
970 static int setup_buffer_kind_and_compression(struct vm_gk20a *vm,
972 struct buffer_attrs *bfr,
973 enum gmmu_pgsz_gk20a pgsz_idx)
975 bool kind_compressible;
976 struct gk20a *g = gk20a_from_vm(vm);
977 struct device *d = dev_from_gk20a(g);
978 int ctag_granularity = g->ops.fb.compression_page_size(g);
980 if (unlikely(bfr->kind_v == gmmu_pte_kind_invalid_v()))
981 bfr->kind_v = gmmu_pte_kind_pitch_v();
983 if (unlikely(!gk20a_kind_is_supported(bfr->kind_v))) {
984 gk20a_err(d, "kind 0x%x not supported", bfr->kind_v);
988 bfr->uc_kind_v = gmmu_pte_kind_invalid_v();
989 /* find a suitable uncompressed kind if it becomes necessary later */
990 kind_compressible = gk20a_kind_is_compressible(bfr->kind_v);
991 if (kind_compressible) {
992 bfr->uc_kind_v = gk20a_get_uncompressed_kind(bfr->kind_v);
993 if (unlikely(bfr->uc_kind_v == gmmu_pte_kind_invalid_v())) {
994 /* shouldn't happen, but it is worth cross-checking */
995 gk20a_err(d, "comptag kind 0x%x can't be"
996 " downgraded to uncompressed kind",
1001 /* comptags only supported for suitable kinds, 128KB pagesize */
1002 if (unlikely(kind_compressible &&
1003 (vm->gmmu_page_sizes[pgsz_idx] != vm->big_page_size))) {
1005 gk20a_warn(d, "comptags specified"
1006 " but pagesize being used doesn't support it");*/
1007 /* it is safe to fall back to uncompressed as
1008 functionality is not harmed */
1009 bfr->kind_v = bfr->uc_kind_v;
1010 kind_compressible = false;
1012 if (kind_compressible)
1013 bfr->ctag_lines = DIV_ROUND_UP_ULL(bfr->size, ctag_granularity);
1015 bfr->ctag_lines = 0;
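/*
 * Example (assuming a 128KB compression page size): a 1MB buffer needs
 * DIV_ROUND_UP(1MB, 128KB) = 8 comptag lines.
 */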
1020 static int validate_fixed_buffer(struct vm_gk20a *vm,
1021 struct buffer_attrs *bfr,
1022 u64 map_offset, u64 map_size)
1024 struct device *dev = dev_from_vm(vm);
1025 struct vm_reserved_va_node *va_node;
1026 struct mapped_buffer_node *buffer;
1027 u64 map_end = map_offset + map_size;
1029 /* can wrap around with insane map_size; zero is disallowed too */
1030 if (map_end <= map_offset) {
1031 gk20a_warn(dev, "fixed offset mapping with invalid map_size");
1035 if (map_offset & (vm->gmmu_page_sizes[bfr->pgsz_idx] - 1)) {
1036 gk20a_err(dev, "map offset must be buffer page size aligned 0x%llx",
1041 /* find the space reservation */
1042 va_node = addr_to_reservation(vm, map_offset);
1044 gk20a_warn(dev, "fixed offset mapping without space allocation");
1048 /* mapped area should fit inside va */
1049 if (map_end > va_node->vaddr_start + va_node->size) {
1050 gk20a_warn(dev, "fixed offset mapping size overflows va node");
1054 /* check that this mapping does not collide with existing
1055 * mappings by checking the overlapping area between the current
1056 * buffer and all other mapped buffers */
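/*
 * Example: an existing buffer at [0x10000, 0x20000) and a fixed-offset
 * request at [0x18000, 0x28000) give begin = 0x18000 and end = 0x20000
 * below, so end - begin > 0 and the request is rejected as overlapping.
 */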
1058 list_for_each_entry(buffer,
1059 &va_node->va_buffers_list, va_buffers_list) {
1060 s64 begin = max(buffer->addr, map_offset);
1061 s64 end = min(buffer->addr +
1062 buffer->size, map_offset + map_size);
1063 if (end - begin > 0) {
1064 gk20a_warn(dev, "overlapping buffer map requested");
1072 u64 gk20a_locked_gmmu_map(struct vm_gk20a *vm,
1074 struct sg_table *sgt,
1086 bool allocated = false;
1087 struct device *d = dev_from_vm(vm);
1088 struct gk20a *g = gk20a_from_vm(vm);
1089 int ctag_granularity = g->ops.fb.compression_page_size(g);
1091 if (clear_ctags && ctag_offset) {
1092 u32 ctag_lines = DIV_ROUND_UP_ULL(size, ctag_granularity);
1094 /* init/clear the ctag buffer */
1095 g->ops.ltc.cbc_ctrl(g, gk20a_cbc_op_clear,
1096 ctag_offset, ctag_offset + ctag_lines - 1);
1099 /* Allocate (or validate when map_offset != 0) the virtual address. */
1101 map_offset = gk20a_vm_alloc_va(vm, size,
1104 gk20a_err(d, "failed to allocate va space");
1111 err = update_gmmu_ptes_locked(vm, pgsz_idx,
1114 map_offset, map_offset + size,
1118 NVGPU_MAP_BUFFER_FLAGS_CACHEABLE_TRUE,
1120 NVGPU_GPU_FLAGS_SUPPORT_UNMAPPED_PTE,
1124 gk20a_err(d, "failed to update ptes on map");
1128 g->ops.mm.tlb_invalidate(vm);
1133 gk20a_vm_free_va(vm, map_offset, size, pgsz_idx);
1135 gk20a_err(d, "%s: failed with err=%d\n", __func__, err);
1139 void gk20a_locked_gmmu_unmap(struct vm_gk20a *vm,
1148 struct gk20a *g = gk20a_from_vm(vm);
1151 err = gk20a_vm_free_va(vm, vaddr, size, pgsz_idx);
1153 dev_err(dev_from_vm(vm),
1154 "failed to free va");
1159 /* unmap here needs to know the page size we assigned at mapping */
1160 err = update_gmmu_ptes_locked(vm,
1162 NULL, /* n/a for unmap */
1166 0, 0, false /* n/a for unmap */,
1170 dev_err(dev_from_vm(vm),
1171 "failed to update gmmu ptes on unmap");
1173 /* flush l2 so any dirty lines are written out *now*.
1174 * also as we could potentially be switching this buffer
1175 * from nonvolatile (l2 cacheable) to volatile (l2 non-cacheable) at
1176 * some point in the future we need to invalidate l2. e.g. switching
1177 * from a render buffer unmap (here) to later using the same memory
1178 * for gmmu ptes. note the positioning of this relative to any smmu
1179 * unmapping (below). */
1181 gk20a_mm_l2_flush(g, true);
1183 g->ops.mm.tlb_invalidate(vm);
1186 static u64 gk20a_vm_map_duplicate_locked(struct vm_gk20a *vm,
1187 struct dma_buf *dmabuf,
1191 struct sg_table **sgt,
1195 struct mapped_buffer_node *mapped_buffer = NULL;
1198 find_mapped_buffer_reverse_locked(&vm->mapped_buffers,
1203 if (mapped_buffer->flags != flags)
1206 if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET &&
1207 mapped_buffer->addr != offset_align)
1210 BUG_ON(mapped_buffer->vm != vm);
1212 /* mark the buffer as used */
1214 if (mapped_buffer->user_mapped == 0)
1215 vm->num_user_mapped_buffers++;
1216 mapped_buffer->user_mapped++;
1218 /* If the mapping comes from user space, we own
1219 * the handle ref. Since we reuse an
1220 * existing mapping here, we need to give back those
1221 * refs once in order not to leak.
1223 if (mapped_buffer->own_mem_ref)
1224 dma_buf_put(mapped_buffer->dmabuf);
1226 mapped_buffer->own_mem_ref = true;
1228 kref_get(&mapped_buffer->ref);
1230 gk20a_dbg(gpu_dbg_map,
1231 "reusing as=%d pgsz=%d flags=0x%x ctags=%d "
1232 "start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x "
1233 "own_mem_ref=%d user_mapped=%d",
1234 vm_aspace_id(vm), mapped_buffer->pgsz_idx,
1235 mapped_buffer->flags,
1236 mapped_buffer->ctag_lines,
1237 mapped_buffer->ctag_offset,
1238 hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
1239 hi32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1240 lo32((u64)sg_dma_address(mapped_buffer->sgt->sgl)),
1241 hi32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1242 lo32((u64)sg_phys(mapped_buffer->sgt->sgl)),
1243 mapped_buffer->own_mem_ref, user_mapped);
1246 *sgt = mapped_buffer->sgt;
1247 return mapped_buffer->addr;
1250 u64 gk20a_vm_map(struct vm_gk20a *vm,
1251 struct dma_buf *dmabuf,
1253 u32 flags /*NVGPU_AS_MAP_BUFFER_FLAGS_*/,
1255 struct sg_table **sgt,
1261 struct gk20a *g = gk20a_from_vm(vm);
1262 struct gk20a_allocator *ctag_allocator = &g->gr.comp_tags;
1263 struct device *d = dev_from_vm(vm);
1264 struct mapped_buffer_node *mapped_buffer = NULL;
1265 bool inserted = false, va_allocated = false;
1266 u32 gmmu_page_size = 0;
1269 struct buffer_attrs bfr = {NULL};
1270 struct gk20a_comptags comptags;
1272 bool clear_ctags = false;
1274 mutex_lock(&vm->update_gmmu_lock);
1276 /* check if this buffer is already mapped */
1277 map_offset = gk20a_vm_map_duplicate_locked(vm, dmabuf, offset_align,
1279 user_mapped, rw_flag);
1281 mutex_unlock(&vm->update_gmmu_lock);
1285 /* pin buffer to get phys/iovmm addr */
1286 bfr.sgt = gk20a_mm_pin(d, dmabuf);
1287 if (IS_ERR(bfr.sgt)) {
1288 /* Falling back to physical is actually possible
1289 * here in many cases if we use 4K phys pages in the
1290 * gmmu. However we have some regions which require
1291 * contig regions to work properly (either phys-contig
1292 * or contig through smmu io_vaspace). Until we can
1293 * track the difference between those two cases we have
1294 * to fail the mapping when we run out of SMMU space.
1296 gk20a_warn(d, "oom allocating tracking buffer");
1304 bfr.size = dmabuf->size;
1305 buf_addr = (u64)sg_dma_address(bfr.sgt->sgl);
1306 if (unlikely(!buf_addr))
1307 buf_addr = (u64)sg_phys(bfr.sgt->sgl);
1308 bfr.align = 1 << __ffs(buf_addr);
1310 mapping_size = mapping_size ? mapping_size : bfr.size;
1312 /* If FIXED_OFFSET is set, pgsz is determined. Otherwise, select
1313 * page size according to memory alignment */
1314 if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1315 bfr.pgsz_idx = NV_GMMU_VA_IS_UPPER(offset_align) ?
1316 gmmu_page_size_big : gmmu_page_size_small;
1319 gmmu_select_page_size(vm, &bfr);
1321 bfr.pgsz_idx = gmmu_page_size_small;
1324 /* validate/adjust bfr attributes */
1325 if (unlikely(bfr.pgsz_idx == -1)) {
1326 gk20a_err(d, "unsupported page size detected");
1330 if (unlikely(bfr.pgsz_idx < gmmu_page_size_small ||
1331 bfr.pgsz_idx > gmmu_page_size_big)) {
1336 gmmu_page_size = vm->gmmu_page_sizes[bfr.pgsz_idx];
1338 /* Check if we should use a fixed offset for mapping this buffer */
1340 if (flags & NVGPU_AS_MAP_BUFFER_FLAGS_FIXED_OFFSET) {
1341 err = validate_fixed_buffer(vm, &bfr,
1342 offset_align, mapping_size);
1346 map_offset = offset_align;
1347 va_allocated = false;
1349 va_allocated = true;
1354 err = setup_buffer_kind_and_compression(vm, flags, &bfr, bfr.pgsz_idx);
1355 if (unlikely(err)) {
1356 gk20a_err(d, "failure setting up kind and compression");
1360 /* bar1 and pmu vm don't need ctag */
1361 if (!vm->enable_ctag)
1364 gk20a_get_comptags(d, dmabuf, &comptags);
1366 if (bfr.ctag_lines && !comptags.lines) {
1367 /* allocate compression resources if needed */
1368 err = gk20a_alloc_comptags(d, dmabuf, ctag_allocator,
1371 /* ok to fall back here if we ran out */
1372 /* TBD: we can partially alloc ctags as well... */
1373 bfr.ctag_lines = bfr.ctag_offset = 0;
1374 bfr.kind_v = bfr.uc_kind_v;
1376 gk20a_get_comptags(d, dmabuf, &comptags);
1381 /* store the comptag info */
1382 bfr.ctag_offset = comptags.offset;
1384 /* update gmmu ptes */
1385 map_offset = g->ops.mm.gmmu_map(vm, map_offset,
1387 buffer_offset, /* sg offset */
1398 gk20a_dbg(gpu_dbg_map,
1400 "kind=0x%x kind_uc=0x%x flags=0x%x "
1401 "ctags=%d start=%d gv=0x%x,%08x -> 0x%x,%08x -> 0x%x,%08x",
1402 vm_aspace_id(vm), gmmu_page_size,
1403 bfr.kind_v, bfr.uc_kind_v, flags,
1404 bfr.ctag_lines, bfr.ctag_offset,
1405 hi32(map_offset), lo32(map_offset),
1406 hi32((u64)sg_dma_address(bfr.sgt->sgl)),
1407 lo32((u64)sg_dma_address(bfr.sgt->sgl)),
1408 hi32((u64)sg_phys(bfr.sgt->sgl)),
1409 lo32((u64)sg_phys(bfr.sgt->sgl)));
1411 #if defined(NVHOST_DEBUG)
1414 struct scatterlist *sg = NULL;
1415 gk20a_dbg(gpu_dbg_pte, "for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i)");
1416 for_each_sg(bfr.sgt->sgl, sg, bfr.sgt->nents, i ) {
1417 u64 da = sg_dma_address(sg);
1418 u64 pa = sg_phys(sg);
1419 u64 len = sg->length;
1420 gk20a_dbg(gpu_dbg_pte, "i=%d pa=0x%x,%08x da=0x%x,%08x len=0x%x,%08x",
1421 i, hi32(pa), lo32(pa), hi32(da), lo32(da),
1422 hi32(len), lo32(len));
1427 /* keep track of the buffer for unmapping */
1428 /* TBD: check for multiple mapping of same buffer */
1429 mapped_buffer = kzalloc(sizeof(*mapped_buffer), GFP_KERNEL);
1430 if (!mapped_buffer) {
1431 gk20a_warn(d, "oom allocating tracking buffer");
1434 mapped_buffer->dmabuf = dmabuf;
1435 mapped_buffer->sgt = bfr.sgt;
1436 mapped_buffer->addr = map_offset;
1437 mapped_buffer->size = mapping_size;
1438 mapped_buffer->pgsz_idx = bfr.pgsz_idx;
1439 mapped_buffer->ctag_offset = bfr.ctag_offset;
1440 mapped_buffer->ctag_lines = bfr.ctag_lines;
1441 mapped_buffer->vm = vm;
1442 mapped_buffer->flags = flags;
1443 mapped_buffer->kind = kind;
1444 mapped_buffer->va_allocated = va_allocated;
1445 mapped_buffer->user_mapped = user_mapped ? 1 : 0;
1446 mapped_buffer->own_mem_ref = user_mapped;
1447 INIT_LIST_HEAD(&mapped_buffer->unmap_list);
1448 INIT_LIST_HEAD(&mapped_buffer->va_buffers_list);
1449 kref_init(&mapped_buffer->ref);
1451 err = insert_mapped_buffer(&vm->mapped_buffers, mapped_buffer);
1453 gk20a_err(d, "failed to insert into mapped buffer tree");
1458 vm->num_user_mapped_buffers++;
1460 gk20a_dbg_info("allocated va @ 0x%llx", map_offset);
1462 if (!va_allocated) {
1463 struct vm_reserved_va_node *va_node;
1465 /* find the space reservation */
1466 va_node = addr_to_reservation(vm, map_offset);
1467 list_add_tail(&mapped_buffer->va_buffers_list,
1468 &va_node->va_buffers_list);
1469 mapped_buffer->va_node = va_node;
1472 mutex_unlock(&vm->update_gmmu_lock);
1478 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
1480 vm->num_user_mapped_buffers--;
1482 kfree(mapped_buffer);
1484 gk20a_vm_free_va(vm, map_offset, bfr.size, bfr.pgsz_idx);
1485 if (!IS_ERR(bfr.sgt))
1486 gk20a_mm_unpin(d, dmabuf, bfr.sgt);
1488 mutex_unlock(&vm->update_gmmu_lock);
1489 gk20a_dbg_info("err=%d\n", err);
1493 u64 gk20a_gmmu_map(struct vm_gk20a *vm,
1494 struct sg_table **sgt,
1499 struct gk20a *g = gk20a_from_vm(vm);
1502 mutex_lock(&vm->update_gmmu_lock);
1503 vaddr = g->ops.mm.gmmu_map(vm, 0, /* already mapped? - No */
1504 *sgt, /* sg table */
1507 0, /* page size index = 0 i.e. SZ_4K */
1509 0, /* ctag_offset */
1510 flags, rw_flag, false, false);
1511 mutex_unlock(&vm->update_gmmu_lock);
1513 gk20a_err(dev_from_vm(vm), "failed to allocate va space");
1520 int gk20a_gmmu_alloc(struct gk20a *g, size_t size, struct mem_desc *mem)
1522 return gk20a_gmmu_alloc_attr(g, 0, size, mem);
1525 int gk20a_gmmu_alloc_attr(struct gk20a *g, enum dma_attr attr, size_t size, struct mem_desc *mem)
1527 struct device *d = dev_from_gk20a(g);
1534 DEFINE_DMA_ATTRS(attrs);
1535 dma_set_attr(attr, &attrs);
1537 dma_alloc_attrs(d, size, &iova, GFP_KERNEL, &attrs);
1539 mem->cpu_va = dma_alloc_coherent(d, size, &iova, GFP_KERNEL);
1545 err = gk20a_get_sgtable(d, &mem->sgt, mem->cpu_va, iova, size);
1550 memset(mem->cpu_va, 0, size);
1552 gk20a_dbg_fn("done");
1557 dma_free_coherent(d, size, mem->cpu_va, iova);
1563 void gk20a_gmmu_free(struct gk20a *g, struct mem_desc *mem)
1565 struct device *d = dev_from_gk20a(g);
1568 dma_free_coherent(d, mem->size, mem->cpu_va,
1569 sg_dma_address(mem->sgt->sgl));
1573 gk20a_free_sgtable(&mem->sgt);
1576 int gk20a_gmmu_alloc_map(struct vm_gk20a *vm, size_t size, struct mem_desc *mem)
1578 return gk20a_gmmu_alloc_map_attr(vm, 0, size, mem);
1581 int gk20a_gmmu_alloc_map_attr(struct vm_gk20a *vm,
1582 enum dma_attr attr, size_t size, struct mem_desc *mem)
1584 int err = gk20a_gmmu_alloc_attr(vm->mm->g, attr, size, mem);
1589 mem->gpu_va = gk20a_gmmu_map(vm, &mem->sgt, size, 0, gk20a_mem_flag_none);
1598 gk20a_gmmu_free(vm->mm->g, mem);
1602 void gk20a_gmmu_unmap_free(struct vm_gk20a *vm, struct mem_desc *mem)
1605 gk20a_gmmu_unmap(vm, mem->gpu_va, mem->size, gk20a_mem_flag_none);
1608 gk20a_gmmu_free(vm->mm->g, mem);
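/*
 * Illustrative pairing of the alloc_map/unmap_free helpers above (a
 * sketch, not from the original source):
 *
 *	struct mem_desc mem;
 *
 *	if (!gk20a_gmmu_alloc_map(vm, size, &mem)) {
 *		...use mem.cpu_va and mem.gpu_va...
 *		gk20a_gmmu_unmap_free(vm, &mem);
 *	}
 */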
1611 dma_addr_t gk20a_mm_gpuva_to_iova_base(struct vm_gk20a *vm, u64 gpu_vaddr)
1613 struct mapped_buffer_node *buffer;
1614 dma_addr_t addr = 0;
1616 mutex_lock(&vm->update_gmmu_lock);
1617 buffer = find_mapped_buffer_locked(&vm->mapped_buffers, gpu_vaddr);
1619 addr = gk20a_mm_iova_addr(vm->mm->g, buffer->sgt->sgl);
1620 mutex_unlock(&vm->update_gmmu_lock);
1625 void gk20a_gmmu_unmap(struct vm_gk20a *vm,
1630 struct gk20a *g = gk20a_from_vm(vm);
1632 mutex_lock(&vm->update_gmmu_lock);
1633 g->ops.mm.gmmu_unmap(vm,
1636 0, /* page size 4K */
1637 true, /*va_allocated */
1640 mutex_unlock(&vm->update_gmmu_lock);
1643 phys_addr_t gk20a_get_phys_from_iova(struct device *d,
1649 struct dma_iommu_mapping *mapping = to_dma_iommu_mapping(d);
1653 iova = dma_addr & PAGE_MASK;
1654 phys = iommu_iova_to_phys(mapping->domain, iova);
1658 /* get sg_table from already allocated buffer */
1659 int gk20a_get_sgtable(struct device *d, struct sg_table **sgt,
1660 void *cpuva, u64 iova,
1664 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1666 dev_err(d, "failed to allocate memory\n");
1670 err = dma_get_sgtable(d, *sgt,
1674 dev_err(d, "failed to create sg table\n");
1677 sg_dma_address((*sgt)->sgl) = iova;
1688 int gk20a_get_sgtable_from_pages(struct device *d, struct sg_table **sgt,
1689 struct page **pages, u64 iova,
1693 *sgt = kzalloc(sizeof(struct sg_table), GFP_KERNEL);
1695 dev_err(d, "failed to allocate memory\n");
1699 err = sg_alloc_table(*sgt, 1, GFP_KERNEL);
1701 dev_err(d, "failed to allocate sg_table\n");
1704 sg_set_page((*sgt)->sgl, *pages, size, 0);
1705 sg_dma_address((*sgt)->sgl) = iova;
1716 void gk20a_free_sgtable(struct sg_table **sgt)
1718 sg_free_table(*sgt);
1723 u64 gk20a_mm_smmu_vaddr_translate(struct gk20a *g, dma_addr_t iova)
1725 if (!device_is_iommuable(dev_from_gk20a(g)))
1728 return iova | 1ULL << g->ops.mm.get_physical_addr_bits(g);
1731 u64 gk20a_mm_iova_addr(struct gk20a *g, struct scatterlist *sgl)
1733 if (!device_is_iommuable(dev_from_gk20a(g)))
1734 return sg_phys(sgl);
1736 if (sg_dma_address(sgl) == 0)
1737 return sg_phys(sgl);
1739 if (sg_dma_address(sgl) == DMA_ERROR_CODE)
1742 return gk20a_mm_smmu_vaddr_translate(g, sg_dma_address(sgl));
1745 /* for gk20a the "video memory" apertures here are misnomers. */
1746 static inline u32 big_valid_pde0_bits(u64 pte_addr)
1749 gmmu_pde_aperture_big_video_memory_f() |
1750 gmmu_pde_address_big_sys_f(
1751 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1755 static inline u32 small_valid_pde1_bits(u64 pte_addr)
1758 gmmu_pde_aperture_small_video_memory_f() |
1759 gmmu_pde_vol_small_true_f() | /* tbd: why? */
1760 gmmu_pde_address_small_sys_f(
1761 (u32)(pte_addr >> gmmu_pde_address_shift_v()));
1765 /* Given the current state of the ptes associated with a pde,
1766 determine value and write it out. There's no checking
1767 here to determine whether or not a change was actually
1768 made. So, superfluous updates will cause unnecessary
1769 pde writes.
1770 */
1771 static int update_gmmu_pde_locked(struct vm_gk20a *vm,
1772 struct gk20a_mm_entry *pte,
1773 u32 i, u32 gmmu_pgsz_idx,
1775 u32 kind_v, u32 *ctag,
1776 bool cacheable, bool unmapped_pte,
1777 int rw_flag, bool sparse)
1779 bool small_valid, big_valid;
1780 u64 pte_addr_small = 0, pte_addr_big = 0;
1781 struct gk20a_mm_entry *entry = vm->pdb.entries + i;
1782 u32 pde_v[2] = {0, 0};
1787 small_valid = entry->size && entry->pgsz == gmmu_page_size_small;
1788 big_valid = entry->size && entry->pgsz == gmmu_page_size_big;
1791 pte_addr_small = gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl);
1794 pte_addr_big = gk20a_mm_iova_addr(vm->mm->g, entry->sgt->sgl);
1796 pde_v[0] = gmmu_pde_size_full_f();
1797 pde_v[0] |= big_valid ? big_valid_pde0_bits(pte_addr_big) :
1798 (gmmu_pde_aperture_big_invalid_f());
1800 pde_v[1] |= (small_valid ?
1801 small_valid_pde1_bits(pte_addr_small) :
1802 (gmmu_pde_aperture_small_invalid_f() |
1803 gmmu_pde_vol_small_false_f()))
1805 (big_valid ? (gmmu_pde_vol_big_true_f()) :
1806 gmmu_pde_vol_big_false_f());
1808 pde = pde_from_index(vm, i);
1810 gk20a_mem_wr32(pde, 0, pde_v[0]);
1811 gk20a_mem_wr32(pde, 1, pde_v[1]);
1813 gk20a_dbg(gpu_dbg_pte, "pde:%d,sz=%d = 0x%x,0x%08x",
1814 i, gmmu_pgsz_idx, pde_v[1], pde_v[0]);
1818 static int update_gmmu_pte_locked(struct vm_gk20a *vm,
1819 struct gk20a_mm_entry *pte,
1820 u32 i, u32 gmmu_pgsz_idx,
1822 u32 kind_v, u32 *ctag,
1823 bool cacheable, bool unmapped_pte,
1824 int rw_flag, bool sparse)
1826 struct gk20a *g = gk20a_from_vm(vm);
1827 u32 ctag_granularity = g->ops.fb.compression_page_size(g);
1828 u32 page_size = vm->gmmu_page_sizes[gmmu_pgsz_idx];
1829 u32 pte_w[2] = {0, 0}; /* invalid pte */
1833 pte_w[0] = gmmu_pte_valid_false_f() |
1834 gmmu_pte_address_sys_f(iova
1835 >> gmmu_pte_address_shift_v());
1837 pte_w[0] = gmmu_pte_valid_true_f() |
1838 gmmu_pte_address_sys_f(iova
1839 >> gmmu_pte_address_shift_v());
1841 pte_w[1] = gmmu_pte_aperture_video_memory_f() |
1842 gmmu_pte_kind_f(kind_v) |
1843 gmmu_pte_comptagline_f(*ctag / ctag_granularity);
1845 if (rw_flag == gk20a_mem_flag_read_only) {
1846 pte_w[0] |= gmmu_pte_read_only_true_f();
1848 gmmu_pte_write_disable_true_f();
1849 } else if (rw_flag ==
1850 gk20a_mem_flag_write_only) {
1852 gmmu_pte_read_disable_true_f();
1854 if (!unmapped_pte) {
1857 gmmu_pte_vol_true_f();
1859 /* Store cacheable value behind
1860 * gmmu_pte_write_disable_true_f */
1863 gmmu_pte_write_disable_true_f();
1867 gk20a_dbg(gpu_dbg_pte,
1868 "pte=%d iova=0x%llx kind=%d ctag=%d vol=%d [0x%08x, 0x%08x]",
1870 kind_v, *ctag / ctag_granularity, !cacheable,
1871 pte_w[1], pte_w[0]);
1875 } else if (sparse) {
1876 pte_w[0] = gmmu_pte_valid_false_f();
1877 pte_w[1] |= gmmu_pte_vol_true_f();
1879 gk20a_dbg(gpu_dbg_pte, "pte_cur=%d [0x0,0x0]", i);
1882 gk20a_mem_wr32(pte->cpu_va + i*8, 0, pte_w[0]);
1883 gk20a_mem_wr32(pte->cpu_va + i*8, 1, pte_w[1]);
1888 static int update_gmmu_level_locked(struct vm_gk20a *vm,
1889 struct gk20a_mm_entry *pte,
1890 enum gmmu_pgsz_gk20a pgsz_idx,
1892 u64 gpu_va, u64 gpu_end,
1893 u8 kind_v, u32 *ctag,
1894 bool cacheable, bool unmapped_pte,
1899 const struct gk20a_mmu_level *l = &vm->mmu_levels[lvl];
1900 const struct gk20a_mmu_level *next_l = &vm->mmu_levels[lvl+1];
1903 u64 pde_size = 1ULL << (u64)l->lo_bit[pgsz_idx];
1907 pde_i = (gpu_va & ((1ULL << ((u64)l->hi_bit[pgsz_idx]+1)) - 1ULL))
1908 >> (u64)l->lo_bit[pgsz_idx];
1910 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, l: %d, [%llx,%llx], iova=%llx",
1911 pgsz_idx, lvl, gpu_va, gpu_end-1, iova);
1913 while (gpu_va < gpu_end) {
1914 struct gk20a_mm_entry *next_pte = NULL;
1915 u64 next = min((gpu_va + pde_size) & ~(pde_size-1), gpu_end);
1917 /* Allocate next level */
1918 if (next_l->update_entry) {
1919 if (!pte->entries) {
1922 (l->hi_bit[pgsz_idx]
1923 - l->lo_bit[pgsz_idx]);
1925 vzalloc(sizeof(struct gk20a_mm_entry) *
1927 pte->pgsz = pgsz_idx;
1931 next_pte = pte->entries + pde_i;
1933 if (!next_pte->size) {
1934 err = gk20a_zalloc_gmmu_page_table(vm,
1935 pgsz_idx, next_l, next_pte);
1941 err = l->update_entry(vm, pte, pde_i, pgsz_idx,
1942 iova, kind_v, ctag, cacheable, unmapped_pte,
1947 if (next_l->update_entry) {
1948 /* get cpu access to the ptes */
1949 err = map_gmmu_pages(next_pte);
1951 gk20a_err(dev_from_vm(vm),
1952 "couldn't map ptes for update as=%d",
1956 err = update_gmmu_level_locked(vm, next_pte,
1961 kind_v, ctag, cacheable, unmapped_pte,
1962 rw_flag, sparse, lvl+1);
1963 unmap_gmmu_pages(next_pte);
1970 iova += next - gpu_va;
1975 gk20a_dbg_fn("done");
1980 static int update_gmmu_ptes_locked(struct vm_gk20a *vm,
1981 enum gmmu_pgsz_gk20a pgsz_idx,
1982 struct sg_table *sgt,
1984 u64 gpu_va, u64 gpu_end,
1985 u8 kind_v, u32 ctag_offset,
1986 bool cacheable, bool unmapped_pte,
1990 struct gk20a *g = gk20a_from_vm(vm);
1991 int ctag_granularity = g->ops.fb.compression_page_size(g);
1992 u32 ctag = ctag_offset * ctag_granularity;
1994 u64 space_to_skip = buffer_offset;
1995 u32 page_size = vm->gmmu_page_sizes[pgsz_idx];
1998 gk20a_dbg(gpu_dbg_pte, "size_idx=%d, iova=%llx",
2000 sgt ? gk20a_mm_iova_addr(vm->mm->g, sgt->sgl) : 0ULL);
2002 if (space_to_skip & (page_size - 1))
2006 iova = gk20a_mm_iova_addr(vm->mm->g, sgt->sgl) + space_to_skip;
2008 gk20a_dbg(gpu_dbg_map, "size_idx=%d, gpu_va=[%llx,%llx], iova=%llx",
2009 pgsz_idx, gpu_va, gpu_end-1, iova);
2010 err = map_gmmu_pages(&vm->pdb);
2012 gk20a_err(dev_from_vm(vm),
2013 "couldn't map ptes for update as=%d",
2017 err = update_gmmu_level_locked(vm, &vm->pdb, pgsz_idx,
2021 cacheable, unmapped_pte, rw_flag, sparse, 0);
2022 unmap_gmmu_pages(&vm->pdb);
2026 gk20a_dbg_fn("done");
2031 /* NOTE! mapped_buffers lock must be held */
2032 void gk20a_vm_unmap_locked(struct mapped_buffer_node *mapped_buffer)
2034 struct vm_gk20a *vm = mapped_buffer->vm;
2035 struct gk20a *g = vm->mm->g;
2037 g->ops.mm.gmmu_unmap(vm,
2038 mapped_buffer->addr,
2039 mapped_buffer->size,
2040 mapped_buffer->pgsz_idx,
2041 mapped_buffer->va_allocated,
2042 gk20a_mem_flag_none,
2043 mapped_buffer->va_node ?
2044 mapped_buffer->va_node->sparse : false);
2046 gk20a_dbg(gpu_dbg_map, "as=%d pgsz=%d gv=0x%x,%08x own_mem_ref=%d",
2048 vm->gmmu_page_sizes[mapped_buffer->pgsz_idx],
2049 hi32(mapped_buffer->addr), lo32(mapped_buffer->addr),
2050 mapped_buffer->own_mem_ref);
2052 gk20a_mm_unpin(dev_from_vm(vm), mapped_buffer->dmabuf,
2053 mapped_buffer->sgt);
2055 /* remove from mapped buffer tree and remove list, free */
2056 rb_erase(&mapped_buffer->node, &vm->mapped_buffers);
2057 if (!list_empty(&mapped_buffer->va_buffers_list))
2058 list_del(&mapped_buffer->va_buffers_list);
2060 /* keep track of mapped buffers */
2061 if (mapped_buffer->user_mapped)
2062 vm->num_user_mapped_buffers--;
2064 if (mapped_buffer->own_mem_ref)
2065 dma_buf_put(mapped_buffer->dmabuf);
2067 kfree(mapped_buffer);
2072 void gk20a_vm_unmap(struct vm_gk20a *vm, u64 offset)
2074 struct device *d = dev_from_vm(vm);
2075 struct mapped_buffer_node *mapped_buffer;
2077 mutex_lock(&vm->update_gmmu_lock);
2078 mapped_buffer = find_mapped_buffer_locked(&vm->mapped_buffers, offset);
2079 if (!mapped_buffer) {
2080 mutex_unlock(&vm->update_gmmu_lock);
2081 gk20a_err(d, "invalid addr to unmap 0x%llx", offset);
2085 kref_put(&mapped_buffer->ref, gk20a_vm_unmap_locked_kref);
2086 mutex_unlock(&vm->update_gmmu_lock);
2089 static void gk20a_vm_remove_support_nofree(struct vm_gk20a *vm)
2091 struct mapped_buffer_node *mapped_buffer;
2092 struct vm_reserved_va_node *va_node, *va_node_tmp;
2093 struct rb_node *node;
2095 u32 pde_lo = 0, pde_hi = 0;
2098 mutex_lock(&vm->update_gmmu_lock);
2100 /* TBD: add a flag here for the unmap code to recognize teardown
2101 * and short-circuit any otherwise expensive operations. */
2103 node = rb_first(&vm->mapped_buffers);
2106 container_of(node, struct mapped_buffer_node, node);
2107 gk20a_vm_unmap_locked(mapped_buffer);
2108 node = rb_first(&vm->mapped_buffers);
2111 /* destroy remaining reserved memory areas */
2112 list_for_each_entry_safe(va_node, va_node_tmp, &vm->reserved_va_list,
2114 list_del(&va_node->reserved_va_list);
2118 /* unmapping all buffers above may not actually free
2119 * all vm ptes. jettison them here for certain... */
2120 pde_range_from_vaddr_range(vm,
2123 for (i = 0; i < pde_hi + 1; i++) {
2124 struct gk20a_mm_entry *entry = &vm->pdb.entries[i];
2126 free_gmmu_pages(vm, entry);
2129 unmap_gmmu_pages(&vm->pdb);
2130 free_gmmu_pages(vm, &vm->pdb);
2132 vfree(vm->pdb.entries);
2133 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
2135 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
2137 mutex_unlock(&vm->update_gmmu_lock);
2140 void gk20a_vm_remove_support(struct vm_gk20a *vm)
2142 gk20a_vm_remove_support_nofree(vm);
2143 /* vm is not used anymore. release it. */
2147 static void gk20a_vm_remove_support_kref(struct kref *ref)
2149 struct vm_gk20a *vm = container_of(ref, struct vm_gk20a, ref);
2150 struct gk20a *g = gk20a_from_vm(vm);
2151 g->ops.mm.vm_remove(vm);
2154 void gk20a_vm_get(struct vm_gk20a *vm)
2159 void gk20a_vm_put(struct vm_gk20a *vm)
2161 kref_put(&vm->ref, gk20a_vm_remove_support_kref);
2164 const struct gk20a_mmu_level gk20a_mm_levels_64k[] = {
2165 {.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
2167 .update_entry = update_gmmu_pde_locked,
2169 {.hi_bit = {25, 25},
2171 .update_entry = update_gmmu_pte_locked,
2173 {.update_entry = NULL}
2176 const struct gk20a_mmu_level gk20a_mm_levels_128k[] = {
2177 {.hi_bit = {NV_GMMU_VA_RANGE-1, NV_GMMU_VA_RANGE-1},
2179 .update_entry = update_gmmu_pde_locked,
2181 {.hi_bit = {26, 26},
2183 .update_entry = update_gmmu_pte_locked,
2185 {.update_entry = NULL}
2188 int gk20a_init_vm(struct mm_gk20a *mm,
2189 struct vm_gk20a *vm,
2197 u32 num_small_pages, num_large_pages, low_hole_pages;
2198 char alloc_name[32];
2199 u64 small_vma_size, large_vma_size;
2202 /* note: keep the page sizes sorted lowest to highest here */
2203 u32 gmmu_page_sizes[gmmu_nr_page_sizes] = { SZ_4K, big_page_size };
2207 vm->va_start = low_hole;
2208 vm->va_limit = aperture_size;
2209 vm->big_pages = big_pages;
2211 vm->big_page_size = gmmu_page_sizes[gmmu_page_size_big];
2213 vm->mmu_levels = vm->mm->g->ops.mm.get_mmu_levels(vm->mm->g,
2216 for (i = 0; i < gmmu_nr_page_sizes; i++)
2217 vm->gmmu_page_sizes[i] = gmmu_page_sizes[i];
2219 gk20a_dbg_info("small page-size (%dKB)",
2220 vm->gmmu_page_sizes[gmmu_page_size_small] >> 10);
2222 gk20a_dbg_info("big page-size (%dKB)",
2223 vm->gmmu_page_sizes[gmmu_page_size_big] >> 10);
2225 pde_range_from_vaddr_range(vm,
2228 vm->pdb.entries = vzalloc(sizeof(struct gk20a_mm_entry) *
2231 if (!vm->pdb.entries) {
2236 gk20a_dbg_info("init space for %s va_limit=0x%llx num_pdes=%d",
2237 name, vm->va_limit, pde_hi + 1);
2239 /* allocate the page table directory */
2240 err = gk20a_zalloc_gmmu_page_table(vm, 0, &vm->mmu_levels[0], &vm->pdb);
2244 /* First 16GB of the address space goes towards small pages. Whatever
2245 * remains is allocated to large pages. */
2246 small_vma_size = vm->va_limit;
2248 small_vma_size = (u64)16 << 30;
2249 large_vma_size = vm->va_limit - small_vma_size;
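/*
 * Example with illustrative figures: for a 128GB aperture with big pages
 * enabled, small_vma_size is 16GB and large_vma_size is the remaining
 * 112GB.
 */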
2252 num_small_pages = (u32)(small_vma_size >>
2253 ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
2255 /* num_pages above is without regard to the low-side hole. */
2256 low_hole_pages = (vm->va_start >>
2257 ilog2(vm->gmmu_page_sizes[gmmu_page_size_small]));
2259 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB", name,
2260 vm->gmmu_page_sizes[gmmu_page_size_small]>>10);
2261 err = gk20a_allocator_init(&vm->vma[gmmu_page_size_small],
2263 low_hole_pages, /*start*/
2264 num_small_pages - low_hole_pages);/* length*/
2266 goto clean_up_map_pde;
2269 u32 start = (u32)(small_vma_size >>
2270 ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
2271 num_large_pages = (u32)(large_vma_size >>
2272 ilog2(vm->gmmu_page_sizes[gmmu_page_size_big]));
2274 snprintf(alloc_name, sizeof(alloc_name), "gk20a_%s-%dKB",
2275 name, vm->gmmu_page_sizes[gmmu_page_size_big]>>10);
2276 err = gk20a_allocator_init(&vm->vma[gmmu_page_size_big],
2279 num_large_pages); /* length */
2281 goto clean_up_small_allocator;
2284 vm->mapped_buffers = RB_ROOT;
2286 mutex_init(&vm->update_gmmu_lock);
2287 kref_init(&vm->ref);
2288 INIT_LIST_HEAD(&vm->reserved_va_list);
2292 clean_up_small_allocator:
2293 gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);
2295 unmap_gmmu_pages(&vm->pdb);
2297 free_gmmu_pages(vm, &vm->pdb);
2299 vfree(vm->pdb.entries);
2303 /* address space interfaces for the gk20a module */
2304 int gk20a_vm_alloc_share(struct gk20a_as_share *as_share, u32 big_page_size)
2306 struct gk20a_as *as = as_share->as;
2307 struct gk20a *g = gk20a_from_as(as);
2308 struct mm_gk20a *mm = &g->mm;
2309 struct vm_gk20a *vm;
2315 if (big_page_size == 0)
2317 gk20a_get_platform(g->dev)->default_big_page_size;
2319 if (!is_power_of_2(big_page_size))
2322 if (!(big_page_size & g->gpu_characteristics.available_big_page_sizes))
2325 vm = kzalloc(sizeof(*vm), GFP_KERNEL);
2330 vm->as_share = as_share;
2331 vm->enable_ctag = true;
2333 snprintf(name, sizeof(name), "gk20a_as_%d", as_share->id);
2335 err = gk20a_init_vm(mm, vm, big_page_size, big_page_size << 10,
2336 mm->channel.size, true, name);
2341 int gk20a_vm_release_share(struct gk20a_as_share *as_share)
2343 struct vm_gk20a *vm = as_share->vm;
2347 vm->as_share = NULL;
2349 /* drop the address space's reference to the vm */
2352 as_share->vm = NULL;
2358 int gk20a_vm_alloc_space(struct gk20a_as_share *as_share,
2359 struct nvgpu_as_alloc_space_args *args)
2361 { int err = -ENOMEM;
2364 struct gk20a_allocator *vma;
2365 struct vm_gk20a *vm = as_share->vm;
2366 struct gk20a *g = vm->mm->g;
2367 struct vm_reserved_va_node *va_node;
2368 u64 vaddr_start = 0;
2370 gk20a_dbg_fn("flags=0x%x pgsz=0x%x nr_pages=0x%x o/a=0x%llx",
2371 args->flags, args->page_size, args->pages,
2374 /* determine pagesz idx */
2375 for (pgsz_idx = gmmu_page_size_small;
2376 pgsz_idx < gmmu_nr_page_sizes;
2378 if (vm->gmmu_page_sizes[pgsz_idx] == args->page_size)
2382 if (pgsz_idx >= gmmu_nr_page_sizes) {
2387 va_node = kzalloc(sizeof(*va_node), GFP_KERNEL);
2393 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_SPARSE &&
2394 pgsz_idx != gmmu_page_size_big) {
2401 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_FIXED_OFFSET)
2402 start_page_nr = (u32)(args->o_a.offset >>
2403 ilog2(vm->gmmu_page_sizes[pgsz_idx]));
2405 vma = &vm->vma[pgsz_idx];
2406 err = vma->alloc(vma, &start_page_nr, args->pages, 1);
2412 vaddr_start = (u64)start_page_nr <<
2413 ilog2(vm->gmmu_page_sizes[pgsz_idx]);
2415 va_node->vaddr_start = vaddr_start;
2416 va_node->size = (u64)args->page_size * (u64)args->pages;
2417 va_node->pgsz_idx = pgsz_idx;
2418 INIT_LIST_HEAD(&va_node->va_buffers_list);
2419 INIT_LIST_HEAD(&va_node->reserved_va_list);
2421 mutex_lock(&vm->update_gmmu_lock);
2423 /* mark that we need to use sparse mappings here */
2424 if (args->flags & NVGPU_AS_ALLOC_SPACE_FLAGS_SPARSE) {
2425 u64 map_offset = g->ops.mm.gmmu_map(vm, vaddr_start,
2433 gk20a_mem_flag_none,
2437 mutex_unlock(&vm->update_gmmu_lock);
2438 vma->free(vma, start_page_nr, args->pages, 1);
2443 va_node->sparse = true;
2445 list_add_tail(&va_node->reserved_va_list, &vm->reserved_va_list);
2447 mutex_unlock(&vm->update_gmmu_lock);
2449 args->o_a.offset = vaddr_start;
2455 int gk20a_vm_free_space(struct gk20a_as_share *as_share,
2456 struct nvgpu_as_free_space_args *args)
2461 struct gk20a_allocator *vma;
2462 struct vm_gk20a *vm = as_share->vm;
2463 struct vm_reserved_va_node *va_node;
2464 struct gk20a *g = gk20a_from_vm(vm);
2466 gk20a_dbg_fn("pgsz=0x%x nr_pages=0x%x o/a=0x%llx", args->page_size,
2467 args->pages, args->offset);
2469 /* determine pagesz idx */
2470 for (pgsz_idx = gmmu_page_size_small;
2471 pgsz_idx < gmmu_nr_page_sizes;
2473 if (vm->gmmu_page_sizes[pgsz_idx] == args->page_size)
2477 if (pgsz_idx >= gmmu_nr_page_sizes) {
2482 start_page_nr = (u32)(args->offset >>
2483 ilog2(vm->gmmu_page_sizes[pgsz_idx]));
2485 vma = &vm->vma[pgsz_idx];
2486 err = vma->free(vma, start_page_nr, args->pages, 1);
2491 mutex_lock(&vm->update_gmmu_lock);
2492 va_node = addr_to_reservation(vm, args->offset);
2494 struct mapped_buffer_node *buffer, *n;
2496 /* Decrement the ref count on all buffers in this va_node. This
2497 * allows userspace to let the kernel free mappings that are
2498 * only used by this va_node. */
2499 list_for_each_entry_safe(buffer, n,
2500 &va_node->va_buffers_list, va_buffers_list) {
2501 list_del_init(&buffer->va_buffers_list);
2502 kref_put(&buffer->ref, gk20a_vm_unmap_locked_kref);
2505 list_del(&va_node->reserved_va_list);
2507 /* if this was a sparse mapping, free the va */
2508 if (va_node->sparse)
2509 g->ops.mm.gmmu_unmap(vm,
2510 va_node->vaddr_start,
2514 gk20a_mem_flag_none,
2518 mutex_unlock(&vm->update_gmmu_lock);
2524 int gk20a_vm_bind_channel(struct gk20a_as_share *as_share,
2525 struct channel_gk20a *ch)
2528 struct vm_gk20a *vm = as_share->vm;
2533 err = channel_gk20a_commit_va(ch);
2540 int gk20a_dmabuf_alloc_drvdata(struct dma_buf *dmabuf, struct device *dev)
2542 struct gk20a_dmabuf_priv *priv;
2543 static DEFINE_MUTEX(priv_lock);
2545 priv = dma_buf_get_drvdata(dmabuf, dev);
2549 mutex_lock(&priv_lock);
2550 priv = dma_buf_get_drvdata(dmabuf, dev);
2552 goto priv_exist_or_err;
2553 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
2555 priv = ERR_PTR(-ENOMEM);
2556 goto priv_exist_or_err;
2558 mutex_init(&priv->lock);
2559 INIT_LIST_HEAD(&priv->states);
2560 dma_buf_set_drvdata(dmabuf, dev, priv, gk20a_mm_delete_priv);
2562 mutex_unlock(&priv_lock);
2569 int gk20a_dmabuf_get_state(struct dma_buf *dmabuf, struct device *dev,
2570 u64 offset, struct gk20a_buffer_state **state)
2573 struct gk20a_dmabuf_priv *priv;
2574 struct gk20a_buffer_state *s;
2576 if (WARN_ON(offset >= (u64)dmabuf->size))
2579 err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev);
2583 priv = dma_buf_get_drvdata(dmabuf, dev);
2587 mutex_lock(&priv->lock);
2589 list_for_each_entry(s, &priv->states, list)
2590 if (s->offset == offset)
2593 /* State not found, create state. */
2594 s = kzalloc(sizeof(*s), GFP_KERNEL);
2601 INIT_LIST_HEAD(&s->list);
2602 mutex_init(&s->lock);
2603 list_add_tail(&s->list, &priv->states);
2606 mutex_unlock(&priv->lock);
2614 static int gk20a_dmabuf_get_kind(struct dma_buf *dmabuf)
2617 #ifdef CONFIG_TEGRA_NVMAP
2621 err = nvmap_get_dmabuf_param(dmabuf, NVMAP_HANDLE_PARAM_KIND,
2623 kind = err ? kind : nvmap_param;
int gk20a_vm_map_buffer(struct vm_gk20a *vm,
			int dmabuf_fd,
			u64 *offset_align,
			u32 flags, /*NVGPU_AS_MAP_BUFFER_FLAGS_*/
			int kind,
			u64 buffer_offset,
			u64 mapping_size)
{
	int err = 0;
	struct dma_buf *dmabuf;
	u64 ret_va;

	/* get ref to the mem handle (released on unmap_locked) */
	dmabuf = dma_buf_get(dmabuf_fd);
	if (IS_ERR(dmabuf))
		return PTR_ERR(dmabuf);

	err = gk20a_dmabuf_alloc_drvdata(dmabuf, dev_from_vm(vm));
	if (err) {
		dma_buf_put(dmabuf);
		return err;
	}

	if (kind == -1)
		kind = gk20a_dmabuf_get_kind(dmabuf);

	ret_va = gk20a_vm_map(vm, dmabuf, *offset_align,
			flags, kind, NULL, true,
			gk20a_mem_flag_none,
			buffer_offset,
			mapping_size);

	*offset_align = ret_va;
	if (!ret_va) {
		dma_buf_put(dmabuf);
		err = -EINVAL;
	}

	return err;
}
int gk20a_vm_unmap_buffer(struct vm_gk20a *vm, u64 offset)
{
	gk20a_vm_unmap_user(vm, offset);
	return 0;
}
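/*
 * Undo gk20a_init_vm(): destroy the per-page-size VA allocators and release
 * the page directory (unmap it, free its pages and the entries array).
 */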
void gk20a_deinit_vm(struct vm_gk20a *vm)
{
	gk20a_allocator_destroy(&vm->vma[gmmu_page_size_big]);
	gk20a_allocator_destroy(&vm->vma[gmmu_page_size_small]);

	unmap_gmmu_pages(&vm->pdb);
	free_gmmu_pages(vm, &vm->pdb);
	vfree(vm->pdb.entries);
}
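/*
 * Instance blocks carry the per-context MMU state (page directory base, VA
 * limit, big page size). gk20a_alloc_inst_block() only allocates the backing
 * memory; the contents are filled in later by gk20a_init_inst_block().
 */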
int gk20a_alloc_inst_block(struct gk20a *g, struct mem_desc *inst_block)
{
	struct device *dev = dev_from_gk20a(g);
	int err;

	err = gk20a_gmmu_alloc(g, ram_in_alloc_size_v(), inst_block);
	if (err) {
		gk20a_err(dev, "%s: memory allocation failed\n", __func__);
		return err;
	}

	return 0;
}

void gk20a_free_inst_block(struct gk20a *g, struct mem_desc *inst_block)
{
	if (inst_block->cpu_va)
		gk20a_gmmu_free(g, inst_block);
}
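/*
 * BAR1 address space: sized from bar1_aperture_size_mb_gk20a(), built
 * without big pages and backed by its own instance block.
 */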
static int gk20a_init_bar1_vm(struct mm_gk20a *mm)
{
	int err;
	struct vm_gk20a *vm = &mm->bar1.vm;
	struct gk20a *g = gk20a_from_mm(mm);
	struct mem_desc *inst_block = &mm->bar1.inst_block;
	u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;

	mm->bar1.aperture_size = bar1_aperture_size_mb_gk20a() << 20;
	gk20a_dbg_info("bar1 vm size = 0x%x", mm->bar1.aperture_size);
	gk20a_init_vm(mm, vm, big_page_size, SZ_4K,
		      mm->bar1.aperture_size, false, "bar1");

	err = gk20a_alloc_inst_block(g, inst_block);
	if (err)
		goto clean_up_va;
	gk20a_init_inst_block(inst_block, vm, big_page_size);

	return 0;

clean_up_va:
	gk20a_deinit_vm(vm);
	return err;
}
/* pmu vm, share channel_vm interfaces */
static int gk20a_init_system_vm(struct mm_gk20a *mm)
{
	int err;
	struct vm_gk20a *vm = &mm->pmu.vm;
	struct gk20a *g = gk20a_from_mm(mm);
	struct mem_desc *inst_block = &mm->pmu.inst_block;
	u32 big_page_size = gk20a_get_platform(g->dev)->default_big_page_size;

	mm->pmu.aperture_size = GK20A_PMU_VA_SIZE;
	gk20a_dbg_info("pmu vm size = 0x%x", mm->pmu.aperture_size);

	gk20a_init_vm(mm, vm, big_page_size,
		      SZ_128K << 10, GK20A_PMU_VA_SIZE, false, "system");

	err = gk20a_alloc_inst_block(g, inst_block);
	if (err)
		goto clean_up_va;
	gk20a_init_inst_block(inst_block, vm, big_page_size);

	return 0;

clean_up_va:
	gk20a_deinit_vm(vm);
	return err;
}
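/*
 * The HWPM context reuses the PMU VM and only needs its own instance block;
 * passing a big page size of 0 leaves that field unprogrammed.
 */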
static int gk20a_init_hwpm(struct mm_gk20a *mm)
{
	int err;
	struct vm_gk20a *vm = &mm->pmu.vm;
	struct gk20a *g = gk20a_from_mm(mm);
	struct mem_desc *inst_block = &mm->hwpm.inst_block;

	err = gk20a_alloc_inst_block(g, inst_block);
	if (err)
		return err;
	gk20a_init_inst_block(inst_block, vm, 0);

	return 0;
}
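/*
 * Write the page directory base into an instance block. The PDB address is
 * split into the ram_in_page_dir_base lo/hi words; the lo word also carries
 * the target aperture (video memory) and the volatile bit.
 */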
void gk20a_mm_init_pdb(struct gk20a *g, void *inst_ptr, u64 pdb_addr)
{
	u32 pdb_addr_lo = u64_lo32(pdb_addr >> ram_in_base_shift_v());
	u32 pdb_addr_hi = u64_hi32(pdb_addr);

	gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_lo_w(),
		ram_in_page_dir_base_target_vid_mem_f() |
		ram_in_page_dir_base_vol_true_f() |
		ram_in_page_dir_base_lo_f(pdb_addr_lo));

	gk20a_mem_wr32(inst_ptr, ram_in_page_dir_base_hi_w(),
		ram_in_page_dir_base_hi_f(pdb_addr_hi));
}
void gk20a_init_inst_block(struct mem_desc *inst_block, struct vm_gk20a *vm,
		u32 big_page_size)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u64 pde_addr = gk20a_mm_iova_addr(g, vm->pdb.sgt->sgl);
	phys_addr_t inst_pa = gk20a_mem_phys(inst_block);
	void *inst_ptr = inst_block->cpu_va;

	gk20a_dbg_info("inst block phys = 0x%llx, kv = 0x%p",
		(u64)inst_pa, inst_ptr);

	gk20a_dbg_info("pde pa=0x%llx", (u64)pde_addr);

	g->ops.mm.init_pdb(g, inst_ptr, pde_addr);

	gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_lo_w(),
		u64_lo32(vm->va_limit) | 0xFFF);

	gk20a_mem_wr32(inst_ptr, ram_in_adr_limit_hi_w(),
		ram_in_adr_limit_hi_f(u64_hi32(vm->va_limit)));

	if (big_page_size && g->ops.mm.set_big_page_size)
		g->ops.mm.set_big_page_size(g, inst_ptr, big_page_size);
}
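/*
 * The cache maintenance helpers below follow a common pattern: write the
 * "pending busy" value to the corresponding flush register to kick the
 * operation, then poll that register until neither the outstanding nor the
 * pending bit is set. On silicon the poll gives up after a bounded number of
 * retries; simulation/emulation platforms are allowed to wait indefinitely.
 */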
int gk20a_mm_fb_flush(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	s32 retry = 100;
	int ret = 0;

	mutex_lock(&mm->l2_op_lock);

	/* Make sure all previous writes are committed to the L2. There's no
	   guarantee that writes are to DRAM. This will be a sysmembar internal
	   to the L2. */

	trace_gk20a_mm_fb_flush(g->dev->name);

	gk20a_writel(g, flush_fb_flush_r(),
		flush_fb_flush_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_fb_flush_r());

		if (flush_fb_flush_outstanding_v(data) ==
			flush_fb_flush_outstanding_true_v() ||
		    flush_fb_flush_pending_v(data) ==
			flush_fb_flush_pending_busy_v()) {
			gk20a_dbg_info("fb_flush 0x%x", data);
			retry--;
			udelay(5);
		} else
			break;
	} while (retry >= 0 || !tegra_platform_is_silicon());

	if (tegra_platform_is_silicon() && retry < 0) {
		gk20a_warn(dev_from_gk20a(g),
			   "fb_flush too many retries");
		if (g->ops.fb.dump_vpr_wpr_info)
			g->ops.fb.dump_vpr_wpr_info(g);
		ret = -EBUSY;
	}

	trace_gk20a_mm_fb_flush_done(g->dev->name);

	mutex_unlock(&mm->l2_op_lock);

	return ret;
}
static void gk20a_mm_l2_invalidate_locked(struct gk20a *g)
{
	u32 data;
	s32 retry = 200;

	trace_gk20a_mm_l2_invalidate(g->dev->name);

	/* Invalidate any clean lines from the L2 so subsequent reads go to
	   DRAM. Dirty lines are not affected by this operation. */
	gk20a_writel(g, flush_l2_system_invalidate_r(),
		flush_l2_system_invalidate_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_system_invalidate_r());

		if (flush_l2_system_invalidate_outstanding_v(data) ==
			flush_l2_system_invalidate_outstanding_true_v() ||
		    flush_l2_system_invalidate_pending_v(data) ==
			flush_l2_system_invalidate_pending_busy_v()) {
			gk20a_dbg_info("l2_system_invalidate 0x%x",
					data);
			retry--;
			udelay(5);
		} else
			break;
	} while (retry >= 0 || !tegra_platform_is_silicon());

	if (tegra_platform_is_silicon() && retry < 0)
		gk20a_warn(dev_from_gk20a(g),
			   "l2_system_invalidate too many retries");

	trace_gk20a_mm_l2_invalidate_done(g->dev->name);
}
void gk20a_mm_l2_invalidate(struct gk20a *g)
{
	struct mm_gk20a *mm = &g->mm;

	gk20a_busy_noresume(g->dev);
	if (g->power_on) {
		mutex_lock(&mm->l2_op_lock);
		gk20a_mm_l2_invalidate_locked(g);
		mutex_unlock(&mm->l2_op_lock);
	}
	pm_runtime_put_noidle(&g->dev->dev);
}
void gk20a_mm_l2_flush(struct gk20a *g, bool invalidate)
{
	struct mm_gk20a *mm = &g->mm;
	u32 data;
	s32 retry = 200;

	gk20a_busy_noresume(g->dev);
	if (!g->power_on)
		goto hw_was_off;

	mutex_lock(&mm->l2_op_lock);

	trace_gk20a_mm_l2_flush(g->dev->name);

	/* Flush all dirty lines from the L2 to DRAM. Lines are left in the L2
	   as clean, so subsequent reads might hit in the L2. */
	gk20a_writel(g, flush_l2_flush_dirty_r(),
		flush_l2_flush_dirty_pending_busy_f());

	do {
		data = gk20a_readl(g, flush_l2_flush_dirty_r());

		if (flush_l2_flush_dirty_outstanding_v(data) ==
			flush_l2_flush_dirty_outstanding_true_v() ||
		    flush_l2_flush_dirty_pending_v(data) ==
			flush_l2_flush_dirty_pending_busy_v()) {
			gk20a_dbg_info("l2_flush_dirty 0x%x", data);
			retry--;
			udelay(5);
		} else
			break;
	} while (retry >= 0 || !tegra_platform_is_silicon());

	if (tegra_platform_is_silicon() && retry < 0)
		gk20a_warn(dev_from_gk20a(g),
			   "l2_flush_dirty too many retries");

	trace_gk20a_mm_l2_flush_done(g->dev->name);

	if (invalidate)
		gk20a_mm_l2_invalidate_locked(g);

	mutex_unlock(&mm->l2_op_lock);

hw_was_off:
	pm_runtime_put_noidle(&g->dev->dev);
}
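/*
 * Reverse lookup: given a GPU virtual address, return the dmabuf backing it
 * and the offset of that address within the buffer.
 */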
int gk20a_vm_find_buffer(struct vm_gk20a *vm, u64 gpu_va,
			 struct dma_buf **dmabuf,
			 u64 *offset)
{
	struct mapped_buffer_node *mapped_buffer;

	gk20a_dbg_fn("gpu_va=0x%llx", gpu_va);

	mutex_lock(&vm->update_gmmu_lock);

	mapped_buffer = find_mapped_buffer_range_locked(&vm->mapped_buffers,
							gpu_va);
	if (!mapped_buffer) {
		mutex_unlock(&vm->update_gmmu_lock);
		return -EINVAL;
	}

	*dmabuf = mapped_buffer->dmabuf;
	*offset = gpu_va - mapped_buffer->addr;

	mutex_unlock(&vm->update_gmmu_lock);
	return 0;
}
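/*
 * Invalidate the MMU TLB for this VM's page directory: wait for space in the
 * MMU's PRI fifo, post the PDB address and the invalidate-all-VA trigger,
 * then wait for the fifo to drain so the invalidate is known to have been
 * accepted.
 */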
void gk20a_mm_tlb_invalidate(struct vm_gk20a *vm)
{
	struct gk20a *g = gk20a_from_vm(vm);
	u32 addr_lo = u64_lo32(gk20a_mm_iova_addr(vm->mm->g,
						  vm->pdb.sgt->sgl) >> 12);
	u32 data;
	s32 retry = 200;
	static DEFINE_MUTEX(tlb_lock);

	gk20a_dbg_fn("");

	/* pagetables are considered sw states which are preserved after
	   prepare_poweroff. When gk20a deinit releases those pagetables,
	   common code in vm unmap path calls tlb invalidate that touches
	   hw. Use the power_on flag to skip tlb invalidation when gpu
	   power is turned off */
	if (!g->power_on)
		return;

	mutex_lock(&tlb_lock);

	trace_gk20a_mm_tlb_invalidate(g->dev->name);

	do {
		data = gk20a_readl(g, fb_mmu_ctrl_r());
		if (fb_mmu_ctrl_pri_fifo_space_v(data) != 0)
			break;
		udelay(2);
		retry--;
	} while (retry >= 0 || !tegra_platform_is_silicon());

	if (tegra_platform_is_silicon() && retry < 0) {
		gk20a_warn(dev_from_gk20a(g),
			   "wait mmu fifo space too many retries");
		goto out;
	}

	gk20a_writel(g, fb_mmu_invalidate_pdb_r(),
		fb_mmu_invalidate_pdb_addr_f(addr_lo) |
		fb_mmu_invalidate_pdb_aperture_vid_mem_f());

	gk20a_writel(g, fb_mmu_invalidate_r(),
		fb_mmu_invalidate_all_va_true_f() |
		fb_mmu_invalidate_trigger_true_f());

	do {
		data = gk20a_readl(g, fb_mmu_ctrl_r());
		if (fb_mmu_ctrl_pri_fifo_empty_v(data) !=
			fb_mmu_ctrl_pri_fifo_empty_false_f())
			break;
		retry--;
		udelay(2);
	} while (retry >= 0 || !tegra_platform_is_silicon());

	if (tegra_platform_is_silicon() && retry < 0)
		gk20a_warn(dev_from_gk20a(g),
			   "mmu invalidate too many retries");

	trace_gk20a_mm_tlb_invalidate_done(g->dev->name);

out:
	mutex_unlock(&tlb_lock);
}
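/*
 * Only an ELPG flush is needed on suspend; page tables are software state
 * that is preserved across the power cycle (see the power_on comment in
 * gk20a_mm_tlb_invalidate()).
 */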
int gk20a_mm_suspend(struct gk20a *g)
{
	g->ops.ltc.elpg_flush(g);

	gk20a_dbg_fn("done");
	return 0;
}
bool gk20a_mm_mmu_debug_mode_enabled(struct gk20a *g)
{
	u32 debug_ctrl = gk20a_readl(g, fb_mmu_debug_ctrl_r());

	return fb_mmu_debug_ctrl_debug_v(debug_ctrl) ==
		fb_mmu_debug_ctrl_debug_enabled_v();
}
u32 gk20a_mm_get_physical_addr_bits(struct gk20a *g)
{
	return 34;
}
const struct gk20a_mmu_level *gk20a_mm_get_mmu_levels(struct gk20a *g,
						      u32 big_page_size)
{
	return (big_page_size == SZ_64K) ?
		gk20a_mm_levels_64k : gk20a_mm_levels_128k;
}
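/*
 * Populate the mm HAL with the gk20a implementations. Later chips may
 * override individual entries with their own versions.
 */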
void gk20a_init_mm(struct gpu_ops *gops)
{
	gops->mm.is_debug_mode_enabled = gk20a_mm_mmu_debug_mode_enabled;
	gops->mm.gmmu_map = gk20a_locked_gmmu_map;
	gops->mm.gmmu_unmap = gk20a_locked_gmmu_unmap;
	gops->mm.vm_remove = gk20a_vm_remove_support;
	gops->mm.vm_alloc_share = gk20a_vm_alloc_share;
	gops->mm.vm_bind_channel = gk20a_vm_bind_channel;
	gops->mm.fb_flush = gk20a_mm_fb_flush;
	gops->mm.l2_invalidate = gk20a_mm_l2_invalidate;
	gops->mm.l2_flush = gk20a_mm_l2_flush;
	gops->mm.tlb_invalidate = gk20a_mm_tlb_invalidate;
	gops->mm.get_physical_addr_bits = gk20a_mm_get_physical_addr_bits;
	gops->mm.get_mmu_levels = gk20a_mm_get_mmu_levels;
	gops->mm.init_pdb = gk20a_mm_init_pdb;
	gops->mm.init_mm_setup_hw = gk20a_init_mm_setup_hw;
}