2 * Jailhouse, a Linux-based partitioning hypervisor
4 * Copyright (c) Siemens AG, 2013-2016
5 * Copyright (c) Valentine Sinitsyn, 2014
8 * Jan Kiszka <jan.kiszka@siemens.com>
9 * Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory.
15 #include <jailhouse/control.h>
16 #include <jailhouse/mmio.h>
17 #include <jailhouse/paging.h>
18 #include <jailhouse/pci.h>
19 #include <jailhouse/printk.h>
20 #include <jailhouse/string.h>
22 #include <asm/iommu.h>
23 #include <asm/bitops.h>
24 #include <asm/ioapic.h>
25 #include <asm/spinlock.h>
27 #define VTD_ROOT_PRESENT 0x00000001
29 #define VTD_CTX_PRESENT 0x00000001
30 #define VTD_CTX_TTYPE_MLP_UNTRANS 0x00000000
32 #define VTD_CTX_AGAW_39 0x00000001
33 #define VTD_CTX_AGAW_48 0x00000002
34 #define VTD_CTX_DID_SHIFT 8
41 #define VTD_PAGE_READ 0x00000001
42 #define VTD_PAGE_WRITE 0x00000002
44 #define VTD_MAX_PAGE_TABLE_LEVELS 4
46 #define VTD_VER_REG 0x00
47 # define VTD_VER_MASK BIT_MASK(7, 0)
48 # define VTD_VER_MIN 0x10
49 #define VTD_CAP_REG 0x08
50 # define VTD_CAP_NUM_DID_MASK BIT_MASK(2, 0)
51 # define VTD_CAP_CM (1UL << 7)
52 # define VTD_CAP_SAGAW39 (1UL << 9)
53 # define VTD_CAP_SAGAW48 (1UL << 10)
54 # define VTD_CAP_SLLPS2M (1UL << 34)
55 # define VTD_CAP_SLLPS1G (1UL << 35)
56 # define VTD_CAP_FRO_MASK BIT_MASK(33, 24)
57 # define VTD_CAP_NFR_MASK BIT_MASK(47, 40)
58 #define VTD_ECAP_REG 0x10
59 # define VTD_ECAP_QI (1UL << 1)
60 # define VTD_ECAP_IR (1UL << 3)
61 # define VTD_ECAP_EIM (1UL << 4)
62 #define VTD_GCMD_REG 0x18
63 # define VTD_GCMD_SIRTP (1UL << 24)
64 # define VTD_GCMD_IRE (1UL << 25)
65 # define VTD_GCMD_QIE (1UL << 26)
66 # define VTD_GCMD_SRTP (1UL << 30)
67 # define VTD_GCMD_TE (1UL << 31)
68 #define VTD_GSTS_REG 0x1c
69 # define VTD_GSTS_IRES (1UL << 25)
70 # define VTD_GSTS_QIES (1UL << 26)
71 # define VTD_GSTS_TES (1UL << 31)
72 # define VTD_GSTS_USED_CTRLS \
73 (VTD_GSTS_IRES | VTD_GSTS_QIES | VTD_GSTS_TES)
74 #define VTD_RTADDR_REG 0x20
75 #define VTD_FSTS_REG 0x34
76 # define VTD_FSTS_PFO (1UL << 0)
77 # define VTD_FSTS_PFO_CLEAR 1
78 # define VTD_FSTS_PPF (1UL << 1)
79 # define VTD_FSTS_FRI_MASK BIT_MASK(15, 8)
80 #define VTD_FECTL_REG 0x38
81 #define VTD_FECTL_IM (1UL << 31)
82 #define VTD_FEDATA_REG 0x3c
83 #define VTD_FEADDR_REG 0x40
84 #define VTD_FEUADDR_REG 0x44
85 #define VTD_IQH_REG 0x80
86 # define VTD_IQH_QH_SHIFT 4
87 #define VTD_IQT_REG 0x88
88 # define VTD_IQT_QT_MASK BIT_MASK(18, 4)
89 #define VTD_IQA_REG 0x90
90 # define VTD_IQA_ADDR_MASK BIT_MASK(63, 12)
91 #define VTD_IRTA_REG 0xb8
92 # define VTD_IRTA_SIZE_MASK BIT_MASK(3, 0)
93 # define VTD_IRTA_EIME (1UL << 11)
94 # define VTD_IRTA_ADDR_MASK BIT_MASK(63, 12)
96 #define VTD_REQ_INV_MASK BIT_MASK(3, 0)
98 #define VTD_REQ_INV_CONTEXT 0x01
99 # define VTD_INV_CONTEXT_GLOBAL (1UL << 4)
100 # define VTD_INV_CONTEXT_DOMAIN (2UL << 4)
101 # define VTD_INV_CONTEXT_DOMAIN_SHIFT 16
103 #define VTD_REQ_INV_IOTLB 0x02
104 # define VTD_INV_IOTLB_GLOBAL (1UL << 4)
105 # define VTD_INV_IOTLB_DOMAIN (2UL << 4)
106 # define VTD_INV_IOTLB_DW (1UL << 6)
107 # define VTD_INV_IOTLB_DR (1UL << 7)
108 # define VTD_INV_IOTLB_DOMAIN_SHIFT 16
110 #define VTD_REQ_INV_INT 0x04
111 # define VTD_INV_INT_GLOBAL (0UL << 4)
112 # define VTD_INV_INT_INDEX (1UL << 4)
113 # define VTD_INV_INT_IM_MASK BIT_MASK(31, 27)
114 # define VTD_INV_INT_IM_SHIFT 27
115 # define VTD_INV_INT_IIDX_MASK BIT_MASK(47, 32)
116 # define VTD_INV_INT_IIDX_SHIFT 32
118 #define VTD_REQ_INV_WAIT 0x05
119 #define VTD_INV_WAIT_IF (1UL << 4)
120 #define VTD_INV_WAIT_SW (1UL << 5)
121 #define VTD_INV_WAIT_FN (1UL << 6)
122 #define VTD_INV_WAIT_SDATA_SHIFT 32
124 #define VTD_FRCD_LO_REG 0x0
125 #define VTD_FRCD_LO_FI_MASK BIT_MASK(63, 12)
126 #define VTD_FRCD_HI_REG 0x8
127 #define VTD_FRCD_HI_SID_MASK BIT_MASK(79-64, 64-64)
128 #define VTD_FRCD_HI_FR_MASK BIT_MASK(103-64, 96-64)
129 #define VTD_FRCD_HI_TYPE (1L << (126-64))
130 #define VTD_FRCD_HI_F (1L << (127-64))
131 #define VTD_FRCD_HI_F_CLEAR 1
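/*
 * Illustrative sketch only (it mirrors code further down in this file,
 * it is not additional driver logic): the BIT_MASK()-based field
 * definitions above are meant to be used with the mmio_*_field()
 * accessors. Locating a unit's fault recording registers, for example,
 * reads the FRO capability field, which is given in 16-byte units:
 *
 *   void *fault_regs = reg_base + 16 *
 *       mmio_read64_field(reg_base + VTD_CAP_REG, VTD_CAP_FRO_MASK);
 */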
139 u8 level_triggered:1;
151 } __attribute__((packed)) field;
153 } __attribute__((packed));
155 #define VTD_IRTE_SQ_VERIFY_FULL_SID 0x0
156 #define VTD_IRTE_SVT_VERIFY_SID_SQ 0x1
158 /* A unit can occupy up to 3 pages for registers; we reserve 4. */
159 #define DMAR_MMIO_SIZE (PAGE_SIZE * 4)
161 struct vtd_irte_usage {
165 } __attribute__((packed));
167 struct vtd_emulation {
169 unsigned int irt_entries;
170 struct vtd_irte_usage *irte_map;
180 static const struct vtd_entry inv_global_context = {
181 .lo_word = VTD_REQ_INV_CONTEXT | VTD_INV_CONTEXT_GLOBAL,
183 static const struct vtd_entry inv_global_iotlb = {
184 .lo_word = VTD_REQ_INV_IOTLB | VTD_INV_IOTLB_GLOBAL |
185 VTD_INV_IOTLB_DW | VTD_INV_IOTLB_DR,
187 static const struct vtd_entry inv_global_int = {
188 .lo_word = VTD_REQ_INV_INT | VTD_INV_INT_GLOBAL,
191 /* TODO: Support multiple segments */
192 static struct vtd_entry __attribute__((aligned(PAGE_SIZE)))
193 root_entry_table[256];
194 static union vtd_irte *int_remap_table;
195 static unsigned int int_remap_table_size_log2;
196 static struct paging vtd_paging[VTD_MAX_PAGE_TABLE_LEVELS];
197 static void *dmar_reg_base;
198 static void *unit_inv_queue;
199 static unsigned int dmar_units;
200 static unsigned int dmar_pt_levels;
201 static unsigned int dmar_num_did = ~0U;
202 static unsigned int fault_reporting_cpu_id;
203 static DEFINE_SPINLOCK(inv_queue_lock);
204 static struct vtd_emulation root_cell_units[JAILHOUSE_MAX_IOMMU_UNITS];
205 static bool dmar_units_initialized;
207 unsigned int iommu_mmio_count_regions(struct cell *cell)
209 return cell == &root_cell ? iommu_count_units() : 0;
212 static unsigned int inv_queue_write(void *inv_queue, unsigned int index,
213 struct vtd_entry content)
215 struct vtd_entry *entry = inv_queue;
217 entry[index] = content;
218 arch_paging_flush_cpu_caches(&entry[index], sizeof(*entry));
220 return (index + 1) % (PAGE_SIZE / sizeof(*entry));
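/*
 * Submit a request to a unit's invalidation queue and wait for its
 * completion: the request is followed by an invalidation wait
 * descriptor that makes the hardware write a status word to the local
 * 'completed' variable once all preceding descriptors are processed,
 * and updating VTD_IQT_REG hands both entries to the hardware.
 */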
223 static void vtd_submit_iq_request(void *reg_base, void *inv_queue,
224 const struct vtd_entry *inv_request)
226 volatile u32 completed = 0;
227 struct vtd_entry inv_wait = {
228 .lo_word = VTD_REQ_INV_WAIT | VTD_INV_WAIT_SW |
229 VTD_INV_WAIT_FN | (1UL << VTD_INV_WAIT_SDATA_SHIFT),
230 .hi_word = paging_hvirt2phys(&completed),
234 spin_lock(&inv_queue_lock);
236 index = mmio_read64_field(reg_base + VTD_IQT_REG, VTD_IQT_QT_MASK);
239 index = inv_queue_write(inv_queue, index, *inv_request);
240 index = inv_queue_write(inv_queue, index, inv_wait);
242 mmio_write64_field(reg_base + VTD_IQT_REG, VTD_IQT_QT_MASK, index);
247 spin_unlock(&inv_queue_lock);
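/*
 * Invalidate the context-cache and IOTLB entries of a single domain
 * (cell) on every DMAR unit, using the per-unit invalidation queues.
 */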
250 static void vtd_flush_domain_caches(unsigned int did)
252 const struct vtd_entry inv_context = {
253 .lo_word = VTD_REQ_INV_CONTEXT | VTD_INV_CONTEXT_DOMAIN |
254 (did << VTD_INV_CONTEXT_DOMAIN_SHIFT),
256 const struct vtd_entry inv_iotlb = {
257 .lo_word = VTD_REQ_INV_IOTLB | VTD_INV_IOTLB_DOMAIN |
258 VTD_INV_IOTLB_DW | VTD_INV_IOTLB_DR |
259 (did << VTD_INV_IOTLB_DOMAIN_SHIFT),
261 void *inv_queue = unit_inv_queue;
262 void *reg_base = dmar_reg_base;
265 for (n = 0; n < dmar_units; n++) {
266 vtd_submit_iq_request(reg_base, inv_queue, &inv_context);
267 vtd_submit_iq_request(reg_base, inv_queue, &inv_iotlb);
268 reg_base += DMAR_MMIO_SIZE;
269 inv_queue += PAGE_SIZE;
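/*
 * Set or clear one control bit in VTD_GCMD_REG. The remaining enable
 * bits are taken from VTD_GSTS_REG so that only the controls Jailhouse
 * uses (VTD_GSTS_USED_CTRLS) are preserved; afterwards the status
 * register is polled until the hardware reflects the change.
 */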
273 static void vtd_update_gcmd_reg(void *reg_base, u32 mask, unsigned int set)
275 u32 val = mmio_read32(reg_base + VTD_GSTS_REG) & VTD_GSTS_USED_CTRLS;
281 mmio_write32(reg_base + VTD_GCMD_REG, val);
283 /* Note: This test relies on the fact that the related bits are at the
284 * same positions in VTD_GCMD_REG and VTD_GSTS_REG. */
285 while ((mmio_read32(reg_base + VTD_GSTS_REG) & mask) != (val & mask))
289 static void vtd_set_next_pt(pt_entry_t pte, unsigned long next_pt)
291 *pte = (next_pt & BIT_MASK(51, 12)) | VTD_PAGE_READ | VTD_PAGE_WRITE;
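/*
 * Route the fault reporting interrupt of every DMAR unit as an NMI to
 * one root-cell CPU and remember that CPU as the fault reporting CPU
 * evaluated in iommu_check_pending_faults().
 */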
294 static void vtd_init_fault_nmi(void)
296 union x86_msi_vector msi = { .native.address = MSI_ADDRESS_VALUE };
297 void *reg_base = dmar_reg_base;
298 struct per_cpu *cpu_data;
301 /* This assumes that at least one bit is set somewhere because we
302 * don't support configurations where Linux is left with no CPUs. */
303 for (n = 0; root_cell.cpu_set->bitmap[n] == 0; n++)
305 cpu_data = per_cpu(ffsl(root_cell.cpu_set->bitmap[n]));
307 /* We only support 8-bit APIC IDs. */
308 msi.native.destination = (u8)cpu_data->apic_id;
310 /* Save this value globally to avoid multiple reports of the same
311 * case from different CPUs */
312 fault_reporting_cpu_id = cpu_data->cpu_id;
314 for (n = 0; n < dmar_units; n++, reg_base += DMAR_MMIO_SIZE) {
316 mmio_write32_field(reg_base + VTD_FECTL_REG, VTD_FECTL_IM, 1);
319 * VT-d spec rev. 2.3 section 7.4 suggests that only reading
320 * back FSTS or FECTL ensures no interrupt messages are still
321 * in-flight when we change their destination below.
323 mmio_read32(reg_base + VTD_FECTL_REG);
325 /* Program MSI message to send NMIs to the target CPU */
326 mmio_write32(reg_base + VTD_FEDATA_REG, MSI_DM_NMI);
327 mmio_write32(reg_base + VTD_FEADDR_REG, (u32)msi.raw.address);
328 mmio_write32(reg_base + VTD_FEUADDR_REG, 0);
331 mmio_write32_field(reg_base + VTD_FECTL_REG, VTD_FECTL_IM, 0);
335 * There is a race window between setting the new reporting CPU ID and
336 * updating the target programming in the register. If a fault hits us
337 * in this window and no other NMIs arrive after that, the event will
338 * not be reported. Address this by triggering an NMI on the new
 * reporting CPU.
341 apic_send_nmi_ipi(cpu_data);
344 static void *vtd_get_fault_rec_reg_addr(void *reg_base)
346 return reg_base + 16 *
347 mmio_read64_field(reg_base + VTD_CAP_REG, VTD_CAP_FRO_MASK);
350 static void vtd_print_fault_record_reg_status(unsigned int unit_no,
353 unsigned int sid = mmio_read64_field(reg_base + VTD_FRCD_HI_REG,
354 VTD_FRCD_HI_SID_MASK);
355 unsigned int fr = mmio_read64_field(reg_base + VTD_FRCD_HI_REG,
356 VTD_FRCD_HI_FR_MASK);
357 unsigned long fi = mmio_read64_field(reg_base + VTD_FRCD_LO_REG,
358 VTD_FRCD_LO_FI_MASK);
359 unsigned int type = mmio_read64_field(reg_base + VTD_FRCD_HI_REG,
362 printk("VT-d fault event reported by IOMMU %d:\n", unit_no);
363 printk(" Source Identifier (bus:dev.func): %02x:%02x.%x\n",
364 PCI_BDF_PARAMS(sid));
365 printk(" Fault Reason: 0x%x Fault Info: %lx Type %d\n", fr, fi, type);
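/*
 * Report pending VT-d faults: only the designated fault reporting CPU
 * scans the units, prints every pending fault record and clears its
 * F bit afterwards.
 */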
368 void iommu_check_pending_faults(void)
370 unsigned int fr_index;
371 void *reg_base = dmar_reg_base;
373 void *fault_reg_addr, *rec_reg_addr;
375 if (this_cpu_id() != fault_reporting_cpu_id)
378 for (n = 0; n < dmar_units; n++, reg_base += DMAR_MMIO_SIZE)
379 if (mmio_read32_field(reg_base + VTD_FSTS_REG, VTD_FSTS_PPF)) {
380 fr_index = mmio_read32_field(reg_base + VTD_FSTS_REG,
382 fault_reg_addr = vtd_get_fault_rec_reg_addr(reg_base);
383 rec_reg_addr = fault_reg_addr + 16 * fr_index;
384 vtd_print_fault_record_reg_status(n, rec_reg_addr);
386 /* Clear faults in record registers */
387 mmio_write64_field(rec_reg_addr + VTD_FRCD_HI_REG,
388 VTD_FRCD_HI_F, VTD_FRCD_HI_F_CLEAR);
392 static int vtd_emulate_inv_int(unsigned int unit_no, unsigned int index)
394 struct vtd_irte_usage *irte_usage;
395 struct apic_irq_message irq_msg;
396 struct pci_device *device;
398 if (index >= root_cell_units[unit_no].irt_entries)
400 irte_usage = &root_cell_units[unit_no].irte_map[index];
401 if (!irte_usage->used)
404 device = pci_get_assigned_device(&root_cell, irte_usage->device_id);
405 if (device && device->info->type == JAILHOUSE_PCI_TYPE_IVSHMEM)
406 return pci_ivshmem_update_msix(device);
408 irq_msg = iommu_get_remapped_root_int(unit_no, irte_usage->device_id,
409 irte_usage->vector, index);
410 return iommu_map_interrupt(&root_cell, irte_usage->device_id,
411 irte_usage->vector, irq_msg);
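/*
 * Emulate one descriptor from the root cell's virtual invalidation
 * queue: interrupt-entry invalidations are replayed against the real
 * interrupt remapping table via vtd_emulate_inv_int(), and
 * software-status invalidation waits are completed by writing the
 * requested status value into root-cell memory. Other request types
 * are not handled.
 */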
414 static int vtd_emulate_qi_request(unsigned int unit_no,
415 struct vtd_entry inv_desc)
417 unsigned int start, count, n;
421 switch (inv_desc.lo_word & VTD_REQ_INV_MASK) {
422 case VTD_REQ_INV_INT:
423 if (inv_desc.lo_word & VTD_INV_INT_INDEX) {
424 start = (inv_desc.lo_word & VTD_INV_INT_IIDX_MASK) >>
425 VTD_INV_INT_IIDX_SHIFT;
427 1 << ((inv_desc.lo_word & VTD_INV_INT_IM_MASK) >>
428 VTD_INV_INT_IM_SHIFT);
431 count = root_cell_units[unit_no].irt_entries;
433 for (n = start; n < start + count; n++) {
434 result = vtd_emulate_inv_int(unit_no, n);
439 case VTD_REQ_INV_WAIT:
440 if (inv_desc.lo_word & VTD_INV_WAIT_IF ||
441 !(inv_desc.lo_word & VTD_INV_WAIT_SW))
444 status_page = paging_get_guest_pages(NULL, inv_desc.hi_word, 1,
449 *(u32 *)(status_page + (inv_desc.hi_word & ~PAGE_MASK)) =
450 inv_desc.lo_word >> 32;
457 static enum mmio_result vtd_unit_access_handler(void *arg,
458 struct mmio_access *mmio)
460 struct vtd_emulation *unit = arg;
461 unsigned int unit_no = unit - root_cell_units;
462 struct vtd_entry inv_desc;
465 if (mmio->address == VTD_FSTS_REG && !mmio->is_write) {
467 * Nothing to report this way; iommu_check_pending_faults takes
468 * care of the whole system.
473 if (mmio->address == VTD_IQT_REG && mmio->is_write) {
474 while (unit->iqh != (mmio->value & ~PAGE_MASK)) {
476 paging_get_guest_pages(NULL, unit->iqa, 1,
477 PAGE_READONLY_FLAGS);
479 goto invalid_iq_entry;
482 *(struct vtd_entry *)(inv_desc_page + unit->iqh);
484 if (vtd_emulate_qi_request(unit_no, inv_desc) != 0)
485 goto invalid_iq_entry;
487 unit->iqh += 1 << VTD_IQH_QH_SHIFT;
488 unit->iqh &= ~PAGE_MASK;
492 panic_printk("FATAL: Unhandled DMAR unit %s access, register %02x\n",
493 mmio->is_write ? "write" : "read", mmio->address);
497 panic_printk("FATAL: Invalid/unsupported invalidation queue entry\n");
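/*
 * Put one DMAR unit under hypervisor control: clear stale fault
 * records, install the root entry and interrupt remapping tables,
 * activate the invalidation queue, flush cached translations and
 * interrupt entries, and finally enable DMA translation and interrupt
 * remapping.
 */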
501 static void vtd_init_unit(void *reg_base, void *inv_queue)
503 void *fault_reg_base;
506 /* Disable QI and IR in case they were already enabled */
507 vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
508 vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 0);
510 nfr = mmio_read64_field(reg_base + VTD_CAP_REG, VTD_CAP_NFR_MASK);
511 fault_reg_base = vtd_get_fault_rec_reg_addr(reg_base);
513 for (n = 0; n < nfr; n++)
514 /* Clear fault recording register status */
515 mmio_write64_field(fault_reg_base + 16 * n + VTD_FRCD_HI_REG,
516 VTD_FRCD_HI_F, VTD_FRCD_HI_F_CLEAR);
518 /* Clear fault overflow status */
519 mmio_write32_field(reg_base + VTD_FSTS_REG, VTD_FSTS_PFO,
522 /* Set root entry table pointer */
523 mmio_write64(reg_base + VTD_RTADDR_REG,
524 paging_hvirt2phys(root_entry_table));
525 vtd_update_gcmd_reg(reg_base, VTD_GCMD_SRTP, 1);
527 /* Set interrupt remapping table pointer */
528 mmio_write64(reg_base + VTD_IRTA_REG,
529 paging_hvirt2phys(int_remap_table) |
530 (using_x2apic ? VTD_IRTA_EIME : 0) |
531 (int_remap_table_size_log2 - 1));
532 vtd_update_gcmd_reg(reg_base, VTD_GCMD_SIRTP, 1);
534 /* Set up and activate the invalidation queue */
535 mmio_write64(reg_base + VTD_IQT_REG, 0);
536 mmio_write64(reg_base + VTD_IQA_REG, paging_hvirt2phys(inv_queue));
537 vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 1);
539 vtd_submit_iq_request(reg_base, inv_queue, &inv_global_context);
540 vtd_submit_iq_request(reg_base, inv_queue, &inv_global_iotlb);
541 vtd_submit_iq_request(reg_base, inv_queue, &inv_global_int);
543 vtd_update_gcmd_reg(reg_base, VTD_GCMD_TE, 1);
544 vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 1);
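/*
 * Prepare emulation of interrupt remapping and queued invalidation for
 * the root cell on one unit: register the unit's MMIO region, record
 * Linux' IRTA/IQA and fault event programming so they can be restored
 * on shutdown, and allocate the map that tracks which interrupt
 * remapping entries the root cell uses.
 */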
547 static int vtd_init_ir_emulation(unsigned int unit_no, void *reg_base)
549 struct vtd_emulation *unit = &root_cell_units[unit_no];
550 unsigned long base, size;
553 root_cell.arch.vtd.ir_emulation = true;
555 base = system_config->platform_info.x86.iommu_units[unit_no].base;
556 mmio_region_register(&root_cell, base, PAGE_SIZE,
557 vtd_unit_access_handler, unit);
559 unit->irta = mmio_read64(reg_base + VTD_IRTA_REG);
560 unit->irt_entries = 2 << (unit->irta & VTD_IRTA_SIZE_MASK);
562 size = PAGE_ALIGN(sizeof(struct vtd_irte_usage) * unit->irt_entries);
563 unit->irte_map = page_alloc(&mem_pool, size / PAGE_SIZE);
567 iqt = mmio_read64(reg_base + VTD_IQT_REG);
568 while (mmio_read64(reg_base + VTD_IQH_REG) != iqt)
572 unit->iqa = mmio_read64(reg_base + VTD_IQA_REG);
573 if (unit->iqa & ~VTD_IQA_ADDR_MASK)
574 return trace_error(-EIO);
576 unit->fectl = mmio_read32(reg_base + VTD_FECTL_REG);
577 unit->fedata = mmio_read32(reg_base + VTD_FEDATA_REG);
578 unit->feaddr = mmio_read32(reg_base + VTD_FEADDR_REG);
579 unit->feuaddr = mmio_read32(reg_base + VTD_FEUADDR_REG);
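/*
 * Global initialization of the VT-d driver (the iommu_init() entry
 * point of the IOMMU interface): size the interrupt remapping table
 * from the configured interrupt limit, map each unit's registers into
 * hypervisor space, validate version and required capabilities (QI,
 * IR, and EIM when x2APIC is in use), and derive the common
 * page-table levels, large-page support and domain-ID limit across
 * all units.
 */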
586 unsigned long version, caps, ecaps, ctrls, sllps_caps = ~0UL;
587 unsigned int units, pt_levels, num_did, n;
588 struct jailhouse_iommu *unit;
592 /* n = roundup(log2(system_config->interrupt_limit)) */
593 for (n = 0; (1UL << n) < (system_config->interrupt_limit); n++)
596 return trace_error(-EINVAL);
599 page_alloc(&mem_pool, PAGES(sizeof(union vtd_irte) << n));
600 if (!int_remap_table)
603 int_remap_table_size_log2 = n;
605 units = iommu_count_units();
607 return trace_error(-EINVAL);
609 dmar_reg_base = page_alloc(&remap_pool, units * PAGES(DMAR_MMIO_SIZE));
611 return trace_error(-ENOMEM);
613 unit_inv_queue = page_alloc(&mem_pool, units);
617 for (n = 0; n < units; n++) {
618 unit = &system_config->platform_info.x86.iommu_units[n];
620 reg_base = dmar_reg_base + n * DMAR_MMIO_SIZE;
622 err = paging_create(&hv_paging_structs, unit->base, unit->size,
623 (unsigned long)reg_base,
624 PAGE_DEFAULT_FLAGS | PAGE_FLAG_DEVICE,
625 PAGING_NON_COHERENT);
629 version = mmio_read64(reg_base + VTD_VER_REG) & VTD_VER_MASK;
630 if (version < VTD_VER_MIN || version == 0xff) {
633 printk("WARNING: No VT-d support found!\n");
637 printk("DMAR unit @0x%lx/0x%x\n", unit->base, unit->size);
639 caps = mmio_read64(reg_base + VTD_CAP_REG);
640 if (caps & VTD_CAP_SAGAW39)
642 else if (caps & VTD_CAP_SAGAW48)
645 return trace_error(-EIO);
648 if (dmar_pt_levels > 0 && dmar_pt_levels != pt_levels)
649 return trace_error(-EIO);
650 dmar_pt_levels = pt_levels;
652 if (caps & VTD_CAP_CM)
653 return trace_error(-EIO);
655 ecaps = mmio_read64(reg_base + VTD_ECAP_REG);
656 if (!(ecaps & VTD_ECAP_QI) || !(ecaps & VTD_ECAP_IR) ||
657 (using_x2apic && !(ecaps & VTD_ECAP_EIM)))
658 return trace_error(-EIO);
660 ctrls = mmio_read32(reg_base + VTD_GSTS_REG) &
663 if (ctrls != (VTD_GSTS_IRES | VTD_GSTS_QIES))
664 return trace_error(-EBUSY);
665 err = vtd_init_ir_emulation(n, reg_base);
668 } else if (root_cell.arch.vtd.ir_emulation) {
669 /* IR+QI must be either on or off in all units */
670 return trace_error(-EIO);
673 num_did = 1 << (4 + (caps & VTD_CAP_NUM_DID_MASK) * 2);
674 if (num_did < dmar_num_did)
675 dmar_num_did = num_did;
681 * Derive vtd_paging from the very similar x86_64_paging,
682 * replicating 0..3 for 4 levels and 1..3 for 3 levels.
684 memcpy(vtd_paging, &x86_64_paging[4 - dmar_pt_levels],
685 sizeof(struct paging) * dmar_pt_levels);
686 for (n = 0; n < dmar_pt_levels; n++)
687 vtd_paging[n].set_next_pt = vtd_set_next_pt;
688 if (!(sllps_caps & VTD_CAP_SLLPS1G))
689 vtd_paging[dmar_pt_levels - 3].page_size = 0;
690 if (!(sllps_caps & VTD_CAP_SLLPS2M))
691 vtd_paging[dmar_pt_levels - 2].page_size = 0;
693 return iommu_cell_init(&root_cell);
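/*
 * Update one entry of the physical interrupt remapping table and
 * invalidate the corresponding cached entry on all units. The write
 * order below ensures the hardware never sees a half-updated entry
 * that is marked present.
 */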
696 static void vtd_update_irte(unsigned int index, union vtd_irte content)
698 const struct vtd_entry inv_int = {
699 .lo_word = VTD_REQ_INV_INT | VTD_INV_INT_INDEX |
700 ((u64)index << VTD_INV_INT_IIDX_SHIFT),
702 union vtd_irte *irte = &int_remap_table[index];
703 void *inv_queue = unit_inv_queue;
704 void *reg_base = dmar_reg_base;
707 if (content.field.p) {
709 * Write upper half first to preserve non-presence.
710 * If the entry was present before, we are only modifying the
711 * lower half's content (destination etc.), so writing the
712 * upper half becomes a nop and is safely done first.
714 irte->raw[1] = content.raw[1];
716 irte->raw[0] = content.raw[0];
719 * Write only lower half - we are clearing presence and assignment.
722 irte->raw[0] = content.raw[0];
724 arch_paging_flush_cpu_caches(irte, sizeof(*irte));
726 for (n = 0; n < dmar_units; n++) {
727 vtd_submit_iq_request(reg_base, inv_queue, &inv_int);
728 reg_base += DMAR_MMIO_SIZE;
729 inv_queue += PAGE_SIZE;
733 static int vtd_find_int_remap_region(u16 device_id)
737 /* interrupt_limit is < 2^16, see iommu_init */
738 for (n = 0; n < system_config->interrupt_limit; n++)
739 if (int_remap_table[n].field.assigned &&
740 int_remap_table[n].field.sid == device_id)
746 static int vtd_reserve_int_remap_region(u16 device_id, unsigned int length)
748 int n, start = -E2BIG;
750 if (length == 0 || vtd_find_int_remap_region(device_id) >= 0)
753 for (n = 0; n < system_config->interrupt_limit; n++) {
754 if (int_remap_table[n].field.assigned) {
760 if (n + 1 == start + length) {
761 printk("Reserving %u interrupt(s) for device %04x "
762 "at index %d\n", length, device_id, start);
763 for (n = start; n < start + length; n++) {
764 int_remap_table[n].field.assigned = 1;
765 int_remap_table[n].field.sid = device_id;
770 return trace_error(-E2BIG);
773 static void vtd_free_int_remap_region(u16 device_id, unsigned int length)
775 union vtd_irte free_irte = { .field.p = 0, .field.assigned = 0 };
776 int pos = vtd_find_int_remap_region(device_id);
779 printk("Freeing %u interrupt(s) for device %04x at index %d\n",
780 length, device_id, pos);
782 vtd_update_irte(pos++, free_irte);
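/*
 * Attach a PCI device to its cell's DMA mappings: reserve interrupt
 * remapping entries for the device's MSI/MSI-X vectors and install a
 * context entry pointing to the cell's VT-d page tables, allocating
 * the per-bus context entry table on first use. On failure, the
 * reserved interrupt entries are released again.
 */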
786 int iommu_add_pci_device(struct cell *cell, struct pci_device *device)
788 unsigned int max_vectors = MAX(device->info->num_msi_vectors,
789 device->info->num_msix_vectors);
790 u16 bdf = device->info->bdf;
791 u64 *root_entry_lo = &root_entry_table[PCI_BUS(bdf)].lo_word;
792 struct vtd_entry *context_entry_table, *context_entry;
799 result = vtd_reserve_int_remap_region(bdf, max_vectors);
803 if (*root_entry_lo & VTD_ROOT_PRESENT) {
804 context_entry_table =
805 paging_phys2hvirt(*root_entry_lo & PAGE_MASK);
807 context_entry_table = page_alloc(&mem_pool, 1);
808 if (!context_entry_table)
810 *root_entry_lo = VTD_ROOT_PRESENT |
811 paging_hvirt2phys(context_entry_table);
812 arch_paging_flush_cpu_caches(root_entry_lo, sizeof(u64));
815 context_entry = &context_entry_table[PCI_DEVFN(bdf)];
816 context_entry->lo_word = VTD_CTX_PRESENT | VTD_CTX_TTYPE_MLP_UNTRANS |
817 paging_hvirt2phys(cell->arch.vtd.pg_structs.root_table);
818 context_entry->hi_word =
819 (dmar_pt_levels == 3 ? VTD_CTX_AGAW_39 : VTD_CTX_AGAW_48) |
820 (cell->id << VTD_CTX_DID_SHIFT);
821 arch_paging_flush_cpu_caches(context_entry, sizeof(*context_entry));
826 vtd_free_int_remap_region(bdf, max_vectors);
830 void iommu_remove_pci_device(struct pci_device *device)
832 u16 bdf = device->info->bdf;
833 u64 *root_entry_lo = &root_entry_table[PCI_BUS(bdf)].lo_word;
834 struct vtd_entry *context_entry_table;
835 struct vtd_entry *context_entry;
842 vtd_free_int_remap_region(bdf, MAX(device->info->num_msi_vectors,
843 device->info->num_msix_vectors));
845 context_entry_table = paging_phys2hvirt(*root_entry_lo & PAGE_MASK);
846 context_entry = &context_entry_table[PCI_DEVFN(bdf)];
848 context_entry->lo_word &= ~VTD_CTX_PRESENT;
849 arch_paging_flush_cpu_caches(&context_entry->lo_word, sizeof(u64));
851 for (n = 0; n < 256; n++)
852 if (context_entry_table[n].lo_word & VTD_CTX_PRESENT)
855 *root_entry_lo &= ~VTD_ROOT_PRESENT;
856 arch_paging_flush_cpu_caches(root_entry_lo, sizeof(u64));
857 page_free(&mem_pool, context_entry_table, 1);
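/*
 * Per-cell VT-d setup: check that the cell ID can serve as domain ID
 * on all units, allocate the root of the cell's DMA page tables and
 * reserve interrupt remapping regions for the cell's IRQ chips
 * (IOAPICs).
 */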
860 int iommu_cell_init(struct cell *cell)
862 const struct jailhouse_irqchip *irqchip =
863 jailhouse_cell_irqchips(cell->config);
871 if (cell->id >= dmar_num_did)
872 return trace_error(-ERANGE);
874 cell->arch.vtd.pg_structs.root_paging = vtd_paging;
875 cell->arch.vtd.pg_structs.root_table = page_alloc(&mem_pool, 1);
876 if (!cell->arch.vtd.pg_structs.root_table)
879 /* reserve regions for IRQ chips (if not done already) */
880 for (n = 0; n < cell->config->num_irqchips; n++, irqchip++) {
881 result = vtd_reserve_int_remap_region(irqchip->id,
884 iommu_cell_exit(cell);
892 int iommu_map_memory_region(struct cell *cell,
893 const struct jailhouse_memory *mem)
901 if (!(mem->flags & JAILHOUSE_MEM_DMA))
904 if (mem->virt_start & BIT_MASK(63, 12 + 9 * dmar_pt_levels))
905 return trace_error(-E2BIG);
907 if (mem->flags & JAILHOUSE_MEM_READ)
908 flags |= VTD_PAGE_READ;
909 if (mem->flags & JAILHOUSE_MEM_WRITE)
910 flags |= VTD_PAGE_WRITE;
912 return paging_create(&cell->arch.vtd.pg_structs, mem->phys_start,
913 mem->size, mem->virt_start, flags,
917 int iommu_unmap_memory_region(struct cell *cell,
918 const struct jailhouse_memory *mem)
924 if (!(mem->flags & JAILHOUSE_MEM_DMA))
927 return paging_destroy(&cell->arch.vtd.pg_structs, mem->virt_start,
928 mem->size, PAGING_COHERENT);
931 struct apic_irq_message
932 iommu_get_remapped_root_int(unsigned int iommu, u16 device_id,
933 unsigned int vector, unsigned int remap_index)
935 struct vtd_emulation *unit = &root_cell_units[iommu];
936 struct apic_irq_message irq_msg = { .valid = 0 };
937 union vtd_irte root_irte;
938 unsigned long irte_addr;
941 if (remap_index >= unit->irt_entries)
943 unit->irte_map[remap_index].used = 0;
945 irte_addr = (unit->irta & VTD_IRTA_ADDR_MASK) +
946 remap_index * sizeof(union vtd_irte);
947 irte_page = paging_get_guest_pages(NULL, irte_addr, 1,
948 PAGE_READONLY_FLAGS);
952 root_irte = *(union vtd_irte *)(irte_page + (irte_addr & ~PAGE_MASK));
955 (root_irte.field.p && root_irte.field.sid == device_id);
956 irq_msg.vector = root_irte.field.vector;
957 irq_msg.delivery_mode = root_irte.field.delivery_mode;
958 irq_msg.dest_logical = root_irte.field.dest_logical;
959 irq_msg.level_triggered = root_irte.field.level_triggered;
960 irq_msg.redir_hint = root_irte.field.redir_hint;
961 irq_msg.destination = root_irte.field.destination;
963 /* xAPIC in flat mode: APIC ID in 47:40 (of 63:32) */
964 irq_msg.destination >>= 8;
966 unit->irte_map[remap_index].device_id = device_id;
967 unit->irte_map[remap_index].vector = vector;
968 unit->irte_map[remap_index].used = 1;
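/*
 * Read the root cell's virtual interrupt remapping entry at
 * remap_index and convert it into an APIC interrupt message. The
 * message is only marked valid if the entry is present and its source
 * ID matches the requesting device; the caller uses the result to
 * reprogram the physical remapping entry.
 */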
973 int iommu_map_interrupt(struct cell *cell, u16 device_id, unsigned int vector,
974 struct apic_irq_message irq_msg)
983 base_index = vtd_find_int_remap_region(device_id);
987 if (vector >= system_config->interrupt_limit ||
988 base_index >= system_config->interrupt_limit - vector)
991 irte = int_remap_table[base_index + vector];
992 if (!irte.field.assigned || irte.field.sid != device_id)
995 irte.field.p = irq_msg.valid;
998 * Do not validate non-present entries; they may contain
999 * invalid data and cause false positives.
1004 * Validate delivery mode and destination(s).
1005 * Note that we support the redirection hint only in logical
 * destination mode.
1008 if ((irq_msg.delivery_mode != APIC_MSG_DLVR_FIXED &&
1009 irq_msg.delivery_mode != APIC_MSG_DLVR_LOWPRI) ||
1010 irq_msg.dest_logical != irq_msg.redir_hint)
1012 if (!apic_filter_irq_dest(cell, &irq_msg))
1015 irte.field.dest_logical = irq_msg.dest_logical;
1016 irte.field.redir_hint = irq_msg.redir_hint;
1017 irte.field.level_triggered = irq_msg.level_triggered;
1018 irte.field.delivery_mode = irq_msg.delivery_mode;
1019 irte.field.vector = irq_msg.vector;
1020 irte.field.destination = irq_msg.destination;
1022 /* xAPIC in flat mode: APIC ID in 47:40 (of 63:32) */
1023 irte.field.destination <<= 8;
1024 irte.field.sq = VTD_IRTE_SQ_VERIFY_FULL_SID;
1025 irte.field.svt = VTD_IRTE_SVT_VERIFY_SID_SQ;
1028 vtd_update_irte(base_index + vector, irte);
1030 return base_index + vector;
1033 void iommu_cell_exit(struct cell *cell)
1036 if (dmar_units == 0)
1039 page_free(&mem_pool, cell->arch.vtd.pg_structs.root_table, 1);
1042 * Note that reservation regions of IOAPICs won't be released because
1043 * they might be shared with other cells.
1047 void iommu_config_commit(struct cell *cell_added_removed)
1049 void *inv_queue = unit_inv_queue;
1050 void *reg_base = dmar_reg_base;
1054 if (dmar_units == 0)
1057 if (cell_added_removed)
1058 vtd_init_fault_nmi();
1060 if (cell_added_removed == &root_cell) {
1061 for (n = 0; n < dmar_units; n++) {
1062 vtd_init_unit(reg_base, inv_queue);
1063 reg_base += DMAR_MMIO_SIZE;
1064 inv_queue += PAGE_SIZE;
1066 dmar_units_initialized = true;
1068 if (cell_added_removed)
1069 vtd_flush_domain_caches(cell_added_removed->id);
1070 vtd_flush_domain_caches(root_cell.id);
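/*
 * Hand interrupt remapping back to Linux on shutdown: restore the
 * saved IRTA and IQA programming, bring the hardware invalidation
 * queue head back in sync with Linux' queue state by issuing dummy
 * requests, re-enable interrupt remapping and restore the fault event
 * MSI registers.
 */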
1074 static void vtd_restore_ir(unsigned int unit_no, void *reg_base)
1076 struct vtd_emulation *unit = &root_cell_units[unit_no];
1077 void *inv_queue = unit_inv_queue + unit_no * PAGE_SIZE;
1078 void *root_inv_queue;
1081 mmio_write64(reg_base + VTD_IRTA_REG, unit->irta);
1082 vtd_update_gcmd_reg(reg_base, VTD_GCMD_SIRTP, 1);
1083 vtd_submit_iq_request(reg_base, inv_queue, &inv_global_int);
1085 vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
1086 mmio_write64(reg_base + VTD_IQT_REG, 0);
1087 mmio_write64(reg_base + VTD_IQA_REG, unit->iqa);
1088 vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 1);
1091 * Restore invalidation queue head pointer by issuing dummy requests
1092 * until the hardware is in sync with the Linux state again.
1095 root_inv_queue = paging_get_guest_pages(NULL, unit->iqa, 1,
1096 PAGE_DEFAULT_FLAGS);
1098 while (mmio_read64(reg_base + VTD_IQH_REG) != iqh)
1099 vtd_submit_iq_request(reg_base, root_inv_queue, NULL);
1101 printk("WARNING: Failed to restore invalidation queue head\n");
1103 vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 1);
1105 mmio_write32(reg_base + VTD_FEDATA_REG, unit->fedata);
1106 mmio_write32(reg_base + VTD_FEADDR_REG, unit->feaddr);
1107 mmio_write32(reg_base + VTD_FEUADDR_REG, unit->feuaddr);
1108 mmio_write32(reg_base + VTD_FECTL_REG, unit->fectl);
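/*
 * Disable DMA translation and interrupt remapping on all units; units
 * whose interrupt remapping was emulated for the root cell get their
 * original Linux programming restored via vtd_restore_ir().
 */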
1111 void iommu_shutdown(void)
1113 void *reg_base = dmar_reg_base;
1116 if (dmar_units_initialized)
1117 for (n = 0; n < dmar_units; n++, reg_base += DMAR_MMIO_SIZE) {
1118 vtd_update_gcmd_reg(reg_base, VTD_GCMD_TE, 0);
1119 vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 0);
1120 if (root_cell.arch.vtd.ir_emulation)
1121 vtd_restore_ir(n, reg_base);
1123 vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
1127 bool iommu_cell_emulates_ir(struct cell *cell)
1129 return cell->arch.vtd.ir_emulation;