x86: Emulate interrupt remapping support to enable x2APIC usage
author Jan Kiszka <jan.kiszka@siemens.com>
Tue, 19 Aug 2014 13:47:47 +0000 (15:47 +0200)
committer Jan Kiszka <jan.kiszka@siemens.com>
Thu, 28 Aug 2014 06:36:10 +0000 (08:36 +0200)
If we want to use the x2APIC on real hardware (virtual machines do not
have this limitation), interrupt remapping has to be enabled. As we
take over hardware control from Linux, we either have to switch the
APIC modes on handover (tricky specifically for x2APIC->xAPIC) or let
Linux boot with interrupt remapping already enabled. We choose the
latter as the cleaner way, one that also allows us to run Linux without
xAPIC emulation (non-root cells are expected to use the x2APIC
unconditionally).

IR emulation requires both the interpretation of the interrupt remapping
table that Linux uses (vtd_get_remapped_root_int) and basic queued
invalidation emulation (vtd_emulate_qi_request). We also need to handle
FSTS register reads, but we simply return 0 here and let Jailhouse
report all faults.
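
To make the descriptor handling concrete, here is a minimal,
compilable sketch of the decoding step that vtd_emulate_qi_request
performs. The constants mirror the patched vtd.h; the standalone
types, the printf reporting, and the decode_qi_descriptor/main
wrappers are illustrative stand-ins for the hypervisor environment,
not part of the patch:

    #include <stdint.h>
    #include <stdio.h>

    #define VTD_REQ_INV_MASK        0xfUL
    #define VTD_REQ_INV_INT         0x04
    #define VTD_REQ_INV_WAIT        0x05
    #define VTD_INV_WAIT_IF         (1UL << 4)
    #define VTD_INV_WAIT_SW         (1UL << 5)

    /* 128-bit invalidation queue descriptor, as in the patch */
    struct vtd_entry {
            uint64_t lo_word;
            uint64_t hi_word;
    };

    static int decode_qi_descriptor(const struct vtd_entry *desc)
    {
            switch (desc->lo_word & VTD_REQ_INV_MASK) {
            case VTD_REQ_INV_INT:
                    /* would trigger re-mapping of the affected entries */
                    printf("interrupt entry cache invalidation\n");
                    return 0;
            case VTD_REQ_INV_WAIT:
                    /* only status-write waits can be emulated */
                    if (desc->lo_word & VTD_INV_WAIT_IF ||
                        !(desc->lo_word & VTD_INV_WAIT_SW))
                            return -1;
                    printf("wait: status 0x%x -> phys 0x%llx\n",
                           (uint32_t)(desc->lo_word >> 32),
                           (unsigned long long)desc->hi_word);
                    return 0;
            default:
                    return -1; /* unsupported under IR emulation */
            }
    }

    int main(void)
    {
            struct vtd_entry wait = {
                    .lo_word = VTD_REQ_INV_WAIT | VTD_INV_WAIT_SW |
                               (1ULL << 32), /* status data */
                    .hi_word = 0x1000,       /* status address */
            };

            return decode_qi_descriptor(&wait);
    }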

Physical addresses provided by Linux via registers and data structures
are mapped on demand into the hypervisor. This avoids creating a static
mapping that depends on Linux-controlled parameters (which would be bad
for check-summing). This way we also make sure that the addressed
memory still belongs to Linux.
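
The recurring pattern is to map a single guest page and access the
data at its page offset. page_map_get_guest_pages and its arguments
are taken from the diff below; the read_root_cell_u32 wrapper itself
is hypothetical and only condenses the pattern:

    /* Hypothetical helper condensing the on-demand mapping pattern. */
    static int read_root_cell_u32(u64 phys_addr, u32 *val)
    {
            void *page = page_map_get_guest_pages(NULL, phys_addr, 1,
                                                  PAGE_READONLY_FLAGS);

            /* mapping fails if the memory does not belong to Linux */
            if (!page)
                    return -EINVAL;

            /* the mapping is page-aligned, so re-add the offset */
            *val = *(u32 *)(page + (phys_addr & ~PAGE_MASK));
            return 0;
    }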

Returning IR and QI to Linux is more complex than taking them over
because we not only have to load the overwritten registers with their
original values: the Invalidation Queue Head register cannot be written
by software. Instead, we need to inject dummy invalidation wait
requests until the hardware reaches the value Linux expects.
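
Condensed from vtd_restore_ir in the diff below, the restore loop
boils down to the following; reg_base, root_inv_queue, and the saved
head value are assumed to be set up already:

    /*
     * VTD_IQH_REG is read-only, so advance the hardware queue head by
     * submitting dummy wait descriptors (inv_request == NULL) into
     * Linux' own queue until it matches the last head Linux observed.
     */
    while (mmio_read64(reg_base + VTD_IQH_REG) != saved_iqh)
            vtd_submit_iq_request(reg_base, root_inv_queue, NULL);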

Note that this IR emulation feature is solely designed to be used by the
root cell. Non-root cells have to continue to program the virtualized
interrupt registers of assigned devices.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
README
hypervisor/arch/x86/include/asm/apic.h
hypervisor/arch/x86/include/asm/cell.h
hypervisor/arch/x86/include/asm/vtd.h
hypervisor/arch/x86/ioapic.c
hypervisor/arch/x86/pci.c
hypervisor/arch/x86/vmx.c
hypervisor/arch/x86/vtd.c

diff --git a/README b/README
index a0ad7961cdddd58548eda60b2b23ee42ffc88b0c..9de7f096072f603cd5cf6c5edd1f8bd40389bba0 100644 (file)
--- a/README
+++ b/README
@@ -51,10 +51,13 @@ currently:
    (except when running inside QEMU)
  - at least 2 logical CPUs
  - x86-64 Linux kernel (tested against >= 3.9)
-    - VT-d usage has to be disabled in the Linux kernel, e.g. via command
-      line parameters:
+    - VT-d IOMMU usage (DMAR) has to be disabled in the Linux kernel, e.g. via
+      the command line parameter:
 
-        intel_iommu=off intremap=off
+          intel_iommu=off
+
+    - To exploit the faster x2APIC, interrupt remapping needs to be on in the
+      kernel (check for CONFIG_IRQ_REMAP)
 
 
 Build
index d4af680cedba9acd3909b3be09e3ba05524f3783..e89b8e70db8be8518013a17960e481089819ab87 100644 (file)
@@ -129,6 +129,7 @@ struct apic_irq_message {
        u8 dest_logical:1;
        u8 level_triggered:1;
        u8 redir_hint:1;
+       u8 valid:1;
        u32 destination;
 };
 
index 70afb256ce199ab5ce6c369f889f04af5d75c76c..de822bf436523bcd6496c40d3bad44aaa79c2050 100644 (file)
@@ -36,6 +36,7 @@ struct cell {
 
        struct {
                struct paging_structures pg_structs;
+               bool ir_emulation;
        } vtd;
 
        unsigned int id;
index c9db89770ca66bcbc8bd6c7ce9a6942cc750d226..87df5fa8f58cc8fb5e96ade09301c3d0e6dc2f55 100644 (file)
@@ -57,6 +57,7 @@ struct vtd_entry {
 #define VTD_ECAP_REG                   0x10
 # define VTD_ECAP_QI                   (1UL << 1)
 # define VTD_ECAP_IR                   (1UL << 3)
+# define VTD_ECAP_EIM                  (1UL << 4)
 #define VTD_GCMD_REG                   0x18
 # define VTD_GCMD_SIRTP                        (1UL << 24)
 # define VTD_GCMD_IRE                  (1UL << 25)
@@ -85,10 +86,19 @@ struct vtd_entry {
 #define VTD_PLMLIMIT_REG               0x6c
 #define VTD_PHMBASE_REG                        0x70
 #define VTD_PHMLIMIT_REG               0x78
+#define VTD_IQH_REG                    0x80
+# define VTD_IQH_QH_SHIFT              4
 #define VTD_IQT_REG                    0x88
 # define VTD_IQT_QT_MASK               BIT_MASK(18, 4)
+# define VTD_IQT_QT_SHIFT              4
 #define VTD_IQA_REG                    0x90
+# define VTD_IQA_ADDR_MASK             BIT_MASK(63, 12)
 #define VTD_IRTA_REG                   0xb8
+# define VTD_IRTA_SIZE_MASK            BIT_MASK(3, 0)
+# define VTD_IRTA_EIME                 (1UL << 11)
+# define VTD_IRTA_ADDR_MASK            BIT_MASK(63, 12)
+
+#define VTD_REQ_INV_MASK               BIT_MASK(3, 0)
 
 #define VTD_REQ_INV_CONTEXT            0x01
 # define VTD_INV_CONTEXT_GLOBAL                (1UL << 4)
@@ -105,9 +115,13 @@ struct vtd_entry {
 #define VTD_REQ_INV_INT                        0x04
 # define VTD_INV_INT_GLOBAL            (0UL << 4)
 # define VTD_INV_INT_INDEX             (1UL << 4)
+# define VTD_INV_INT_IM_MASK           BIT_MASK(31, 27)
+# define VTD_INV_INT_IM_SHIFT          27
+# define VTD_INV_INT_IIDX_MASK         BIT_MASK(47, 32)
 # define VTD_INV_INT_IIDX_SHIFT                32
 
 #define VTD_REQ_INV_WAIT               0x05
+#define  VTD_INV_WAIT_IF               (1UL << 4)
 #define  VTD_INV_WAIT_SW               (1UL << 5)
 #define  VTD_INV_WAIT_FN               (1UL << 6)
 #define  VTD_INV_WAIT_SDATA_SHIFT      32
@@ -155,6 +169,9 @@ int vtd_unmap_memory_region(struct cell *cell,
                            const struct jailhouse_memory *mem);
 int vtd_add_pci_device(struct cell *cell, struct pci_device *device);
 void vtd_remove_pci_device(struct pci_device *device);
+struct apic_irq_message
+vtd_get_remapped_root_int(unsigned int iommu, u16 device_id,
+                         unsigned int vector, unsigned int remap_index);
 int vtd_map_interrupt(struct cell *cell, u16 device_id, unsigned int vector,
                      struct apic_irq_message irq_msg);
 void vtd_cell_exit(struct cell *cell);
@@ -164,3 +181,5 @@ void vtd_config_commit(struct cell *cell_added_removed);
 void vtd_shutdown(void);
 
 void vtd_check_pending_faults(struct per_cpu *cpu_data);
+
+int vtd_mmio_access_handler(bool is_write, u64 addr, u32 *value);
index a0b77dd0808e54e2354fe6ba92258f3caebc90d1..426b7b05b6d08264c3c46b3f36da177a8dbcaf6a 100644 (file)
@@ -61,9 +61,22 @@ static void ioapic_reg_write(unsigned int reg, u32 value)
 }
 
 static struct apic_irq_message
-ioapic_translate_redir_entry(union ioapic_redir_entry entry)
+ioapic_translate_redir_entry(struct cell *cell, unsigned int pin,
+                            union ioapic_redir_entry entry)
 {
-       struct apic_irq_message irq_msg;
+       struct apic_irq_message irq_msg = { .valid = 0 };
+       unsigned int idx;
+
+       if (cell->vtd.ir_emulation) {
+               if (!entry.remap.remapped)
+                       return irq_msg;
+
+               idx = entry.remap.int_index | (entry.remap.int_index15 << 15);
+
+               return vtd_get_remapped_root_int(root_cell.ioapic_iommu,
+                                                root_cell.ioapic_id, pin,
+                                                idx);
+       }
 
        irq_msg.vector = entry.native.vector;
        irq_msg.delivery_mode = entry.native.delivery_mode;
@@ -71,6 +84,7 @@ ioapic_translate_redir_entry(union ioapic_redir_entry entry)
        irq_msg.dest_logical = entry.native.dest_logical;
        /* align redir_hint and dest_logical - required by vtd_map_interrupt */
        irq_msg.redir_hint = irq_msg.dest_logical;
+       irq_msg.valid = 1;
        irq_msg.destination = entry.native.destination;
 
        return irq_msg;
@@ -99,7 +113,7 @@ static int ioapic_virt_redir_write(struct cell *cell, unsigned int reg,
                return 0;
        }
 
-       irq_msg = ioapic_translate_redir_entry(entry);
+       irq_msg = ioapic_translate_redir_entry(cell, pin, entry);
 
        result = vtd_map_interrupt(cell, cell->ioapic_id, pin, irq_msg);
        // HACK for QEMU
@@ -119,7 +133,8 @@ static int ioapic_virt_redir_write(struct cell *cell, unsigned int reg,
        return 0;
 }
 
-static void ioapic_mask_pins(u64 pin_bitmap, enum ioapic_handover handover)
+static void ioapic_mask_pins(struct cell *cell, u64 pin_bitmap,
+                            enum ioapic_handover handover)
 {
        union ioapic_redir_entry entry;
        unsigned int pin, reg;
@@ -145,7 +160,8 @@ static void ioapic_mask_pins(u64 pin_bitmap, enum ioapic_handover handover)
                         * interrupts.
                         */
                        entry = shadow_redir_table[pin];
-                       apic_send_irq(ioapic_translate_redir_entry(entry));
+                       apic_send_irq(ioapic_translate_redir_entry(cell, pin,
+                                                                  entry));
                }
        }
 }
@@ -201,9 +217,9 @@ void ioapic_prepare_handover(void)
                return;
        if (irqchip) {
                pin_bitmap = irqchip->pin_bitmap;
-               ioapic_mask_pins(pin_bitmap, PINS_ACTIVE);
+               ioapic_mask_pins(&root_cell, pin_bitmap, PINS_ACTIVE);
        }
-       ioapic_mask_pins(~pin_bitmap, PINS_MASKED);
+       ioapic_mask_pins(&root_cell, ~pin_bitmap, PINS_MASKED);
 }
 
 void ioapic_cell_init(struct cell *cell)
@@ -218,7 +234,8 @@ void ioapic_cell_init(struct cell *cell)
 
                if (cell != &root_cell) {
                        root_cell.ioapic_pin_bitmap &= ~irqchip->pin_bitmap;
-                       ioapic_mask_pins(irqchip->pin_bitmap, PINS_MASKED);
+                       ioapic_mask_pins(cell, irqchip->pin_bitmap,
+                                        PINS_MASKED);
                }
        }
 }
@@ -233,7 +250,7 @@ void ioapic_cell_exit(struct cell *cell)
        if (!cell_irqchip)
                return;
 
-       ioapic_mask_pins(cell_irqchip->pin_bitmap, PINS_MASKED);
+       ioapic_mask_pins(cell, cell_irqchip->pin_bitmap, PINS_MASKED);
        if (root_irqchip)
                root_cell.ioapic_pin_bitmap |= cell_irqchip->pin_bitmap &
                        root_irqchip->pin_bitmap;
index 393795d50fcee92a5d19ad1571fccd0f2c51672f..5604257d66bace648c2f1b0ac439df1fc32ae379 100644 (file)
@@ -243,7 +243,20 @@ static struct apic_irq_message
 pci_translate_msi_vector(struct pci_device *device, unsigned int vector,
                         unsigned int legacy_vectors, union x86_msi_vector msi)
 {
-       struct apic_irq_message irq_msg;
+       struct apic_irq_message irq_msg = { .valid = 0 };
+       unsigned int idx;
+
+       if (device->cell->vtd.ir_emulation) {
+               if (!msi.remap.remapped)
+                       return irq_msg;
+
+               idx = msi.remap.int_index | (msi.remap.int_index15 << 15);
+               if (msi.remap.shv)
+                       idx += msi.remap.subhandle;
+               return vtd_get_remapped_root_int(device->info->iommu,
+                                                device->info->bdf,
+                                                vector, idx);
+       }
 
        irq_msg.vector = msi.native.vector;
        if (legacy_vectors > 1) {
@@ -254,6 +267,7 @@ pci_translate_msi_vector(struct pci_device *device, unsigned int vector,
        irq_msg.level_triggered = 0;
        irq_msg.dest_logical = msi.native.dest_logical;
        irq_msg.redir_hint = msi.native.redir_hint;
+       irq_msg.valid = 1;
        irq_msg.destination = msi.native.destination;
 
        return irq_msg;
index ca5abae2f3fbb892fa4e24d4eb2f975550ea1260..434a95c652c848f733b6c5229345b789fef2d9e3 100644 (file)
@@ -1074,6 +1074,8 @@ static bool vmx_handle_ept_violation(struct registers *guest_regs,
        if (result == 0)
                result = pci_mmio_access_handler(cpu_data->cell, is_write,
                                                 phys_addr, &val);
+       if (result == 0)
+               result = vtd_mmio_access_handler(is_write, phys_addr, &val);
 
        if (result == 1) {
                if (!is_write)
index fb1b83f3502e61320382b7f986e28fe5e726f83e..f741b23b217959adbc60e6b5452f201f7b417070 100644 (file)
 #include <asm/spinlock.h>
 #include <asm/vtd.h>
 
+struct vtd_irte_usage {
+       u16 device_id;
+       u16 vector:10,
+           used:1;
+} __attribute__((packed));
+
+struct vtd_emulation {
+       u64 irta;
+       unsigned int irt_entries;
+       struct vtd_irte_usage *irte_map;
+
+       u64 iqa;
+       u16 iqh;
+
+       u32 fectl;
+       u32 fedata;
+       u32 feaddr, feuaddr;
+};
+
 static const struct vtd_entry inv_global_context = {
        .lo_word = VTD_REQ_INV_CONTEXT | VTD_INV_CONTEXT_GLOBAL,
 };
@@ -45,6 +64,7 @@ static unsigned int dmar_pt_levels;
 static unsigned int dmar_num_did = ~0U;
 static unsigned int fault_reporting_cpu_id;
 static DEFINE_SPINLOCK(inv_queue_lock);
+static struct vtd_emulation root_cell_units[JAILHOUSE_MAX_DMAR_UNITS];
 
 static unsigned int inv_queue_write(void *inv_queue, unsigned int index,
                                    struct vtd_entry content)
@@ -72,7 +92,8 @@ static void vtd_submit_iq_request(void *reg_base, void *inv_queue,
 
        index = mmio_read64_field(reg_base + VTD_IQT_REG, VTD_IQT_QT_MASK);
 
-       index = inv_queue_write(inv_queue, index, *inv_request);
+       if (inv_request)
+               index = inv_queue_write(inv_queue, index, *inv_request);
        index = inv_queue_write(inv_queue, index, inv_wait);
 
        mmio_write64_field(reg_base + VTD_IQT_REG, VTD_IQT_QT_MASK, index);
@@ -209,11 +230,136 @@ void vtd_check_pending_faults(struct per_cpu *cpu_data)
                }
 }
 
+static int vtd_emulate_inv_int(unsigned int unit_no, unsigned int index)
+{
+       struct vtd_irte_usage *irte_usage;
+       struct apic_irq_message irq_msg;
+
+       if (index >= root_cell_units[unit_no].irt_entries)
+               return 0;
+       irte_usage = &root_cell_units[unit_no].irte_map[index];
+       if (!irte_usage->used)
+               return 0;
+
+       irq_msg = vtd_get_remapped_root_int(unit_no, irte_usage->device_id,
+                                           irte_usage->vector, index);
+       return vtd_map_interrupt(&root_cell, irte_usage->device_id,
+                                irte_usage->vector, irq_msg);
+}
+
+static int vtd_emulate_qi_request(unsigned int unit_no,
+                                 struct vtd_entry inv_desc)
+{
+       unsigned int start, count, n;
+       void *status_page;
+       int err;
+
+       switch (inv_desc.lo_word & VTD_REQ_INV_MASK) {
+       case VTD_REQ_INV_INT:
+               if (inv_desc.lo_word & VTD_INV_INT_INDEX) {
+                       start = (inv_desc.lo_word & VTD_INV_INT_IIDX_MASK) >>
+                               VTD_INV_INT_IIDX_SHIFT;
+                       count =
+                           1 << ((inv_desc.lo_word & VTD_INV_INT_IM_MASK) >>
+                                 VTD_INV_INT_IM_SHIFT);
+               } else {
+                       start = 0;
+                       count = root_cell_units[unit_no].irt_entries;
+               }
+               for (n = start; n < start + count; n++) {
+                       err = vtd_emulate_inv_int(unit_no, n);
+                       if (err < 0)
+                               return err;
+               }
+               return 0;
+       case VTD_REQ_INV_WAIT:
+               if (inv_desc.lo_word & VTD_INV_WAIT_IF ||
+                   !(inv_desc.lo_word & VTD_INV_WAIT_SW))
+                       return -EINVAL;
+
+               status_page = page_map_get_guest_pages(NULL, inv_desc.hi_word,
+                                                      1, PAGE_DEFAULT_FLAGS);
+               if (!status_page)
+                       return -EINVAL;
+
+               *(u32 *)(status_page + (inv_desc.hi_word & ~PAGE_MASK)) =
+                       inv_desc.lo_word >> 32;
+
+               return 0;
+       }
+       return -EINVAL;
+}
+
+static int vtd_unit_access_handler(unsigned int unit_no, bool is_write,
+                                  unsigned int reg, u32 *value)
+{
+       struct vtd_emulation *unit = &root_cell_units[unit_no];
+       struct vtd_entry inv_desc;
+       void *inv_desc_page;
+
+       if (reg == VTD_FSTS_REG && !is_write) {
+               /*
+                * Nothing to report this way, vtd_check_pending_faults takes
+                * care for the whole system.
+                */
+               *value = 0;
+               return 1;
+       }
+       if (reg == VTD_IQT_REG && is_write) {
+               while (unit->iqh != (*value & ~PAGE_MASK)) {
+                       inv_desc_page =
+                               page_map_get_guest_pages(NULL, unit->iqa, 1,
+                                                        PAGE_READONLY_FLAGS);
+                       if (!inv_desc_page)
+                               goto invalid_iq_entry;
+
+                       inv_desc =
+                           *(struct vtd_entry *)(inv_desc_page + unit->iqh);
+
+                       if (vtd_emulate_qi_request(unit_no, inv_desc) != 0)
+                               goto invalid_iq_entry;
+
+                       unit->iqh += 1 << VTD_IQH_QH_SHIFT;
+                       unit->iqh &= ~PAGE_MASK;
+               }
+               return 1;
+       }
+       panic_printk("FATAL: Unhandled DMAR unit %s access, register %02x\n",
+                    is_write ? "write" : "read", reg);
+       return -1;
+
+invalid_iq_entry:
+       panic_printk("FATAL: Invalid/unsupported invalidation queue entry\n");
+       return -1;
+}
+
+int vtd_mmio_access_handler(bool is_write, u64 addr, u32 *value)
+{
+       unsigned int n;
+       u64 base_addr;
+
+       if (!this_cell()->vtd.ir_emulation)
+               return 0;
+
+       for (n = 0; n < dmar_units; n++) {
+               base_addr = system_config->platform_info.x86.dmar_unit_base[n];
+               if (addr >= base_addr && addr < base_addr + PAGE_SIZE)
+                       return vtd_unit_access_handler(n, is_write,
+                                                      addr - base_addr,
+                                                      value);
+       }
+       return 0;
+}
+
 static void vtd_init_unit(void *reg_base, void *inv_queue)
 {
        void *fault_reg_base;
        unsigned int nfr, n;
 
+       /* Disable QI and IR in case they were already enabled */
+       vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
+       vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 0);
+
        nfr = mmio_read64_field(reg_base + VTD_CAP_REG, VTD_CAP_NFR_MASK);
        fault_reg_base = vtd_get_fault_rec_reg_addr(reg_base);
 
@@ -234,6 +380,7 @@ static void vtd_init_unit(void *reg_base, void *inv_queue)
        /* Set interrupt remapping table pointer */
        mmio_write64(reg_base + VTD_IRTA_REG,
                     page_map_hvirt2phys(int_remap_table) |
+                    (using_x2apic ? VTD_IRTA_EIME : 0) |
                     (int_remap_table_size_log2 - 1));
        vtd_update_gcmd_reg(reg_base, VTD_GCMD_SIRTP, 1);
 
@@ -250,9 +397,42 @@ static void vtd_init_unit(void *reg_base, void *inv_queue)
        vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 1);
 }
 
+static int vtd_init_ir_emulation(void *reg_base, unsigned int unit_no)
+{
+       struct vtd_emulation *unit = &root_cell_units[unit_no];
+       unsigned long size;
+       u64 iqt;
+
+       root_cell.vtd.ir_emulation = true;
+
+       unit->irta = mmio_read64(reg_base + VTD_IRTA_REG);
+       unit->irt_entries = 2 << (unit->irta & VTD_IRTA_SIZE_MASK);
+
+       size = PAGE_ALIGN(sizeof(struct vtd_irte_usage) * unit->irt_entries);
+       unit->irte_map = page_alloc(&mem_pool, size / PAGE_SIZE);
+       if (!unit->irte_map)
+               return -ENOMEM;
+
+       iqt = mmio_read64(reg_base + VTD_IQT_REG);
+       while (mmio_read64(reg_base + VTD_IQH_REG) != iqt)
+               cpu_relax();
+       unit->iqh = iqt;
+
+       unit->iqa = mmio_read64(reg_base + VTD_IQA_REG);
+       if (unit->iqa & ~VTD_IQA_ADDR_MASK)
+               return -EIO;
+
+       unit->fectl = mmio_read32(reg_base + VTD_FECTL_REG);
+       unit->fedata = mmio_read32(reg_base + VTD_FEDATA_REG);
+       unit->feaddr = mmio_read32(reg_base + VTD_FEADDR_REG);
+       unit->feuaddr = mmio_read32(reg_base + VTD_FEUADDR_REG);
+
+       return 0;
+}
+
 int vtd_init(void)
 {
-       unsigned long version, caps, ecaps, sllps_caps = ~0UL;
+       unsigned long version, caps, ecaps, ctrls, sllps_caps = ~0UL;
        unsigned int pt_levels, num_did, n;
        unsigned int units = 0;
        void *reg_base;
@@ -321,11 +501,22 @@ int vtd_init(void)
                        return -EIO;
 
                ecaps = mmio_read64(reg_base + VTD_ECAP_REG);
-               if (!(ecaps & VTD_ECAP_QI) || !(ecaps & VTD_ECAP_IR))
+               if (!(ecaps & VTD_ECAP_QI) || !(ecaps & VTD_ECAP_IR) ||
+                   (using_x2apic && !(ecaps & VTD_ECAP_EIM)))
                        return -EIO;
 
-               if (mmio_read32(reg_base + VTD_GSTS_REG) & VTD_GSTS_USED_CTRLS)
-                       return -EBUSY;
+               ctrls = mmio_read32(reg_base + VTD_GSTS_REG) &
+                       VTD_GSTS_USED_CTRLS;
+               if (ctrls != 0) {
+                       if (ctrls != (VTD_GSTS_IRES | VTD_GSTS_QIES))
+                               return -EBUSY;
+                       err = vtd_init_ir_emulation(reg_base, n);
+                       if (err)
+                               return err;
+               } else if (root_cell.vtd.ir_emulation) {
+                       /* IR+QI must be either on or off in all units */
+                       return -EIO;
+               }
 
                num_did = 1 << (4 + (caps & VTD_CAP_NUM_DID_MASK) * 2);
                if (num_did < dmar_num_did)
@@ -584,6 +775,57 @@ int vtd_unmap_memory_region(struct cell *cell,
                                mem->size, PAGE_MAP_COHERENT);
 }
 
+struct apic_irq_message
+vtd_get_remapped_root_int(unsigned int iommu, u16 device_id,
+                         unsigned int vector, unsigned int remap_index)
+{
+       struct vtd_emulation *unit = &root_cell_units[iommu];
+       struct apic_irq_message irq_msg = { .valid = 0 };
+       union vtd_irte root_irte;
+       unsigned long irte_addr;
+       void *irte_page;
+       int base_index;
+
+       if (remap_index >= unit->irt_entries)
+               return irq_msg;
+       unit->irte_map[remap_index].used = 0;
+
+       base_index = vtd_find_int_remap_region(device_id);
+       if (base_index < 0)
+               return irq_msg;
+
+       if (vector >= unit->irt_entries ||
+           base_index >= unit->irt_entries - vector)
+               return irq_msg;
+
+       irte_addr = (unit->irta & VTD_IRTA_ADDR_MASK) +
+               remap_index * sizeof(union vtd_irte);
+       irte_page = page_map_get_guest_pages(NULL, irte_addr, 1,
+                                            PAGE_READONLY_FLAGS);
+       if (!irte_page)
+               return irq_msg;
+
+       root_irte = *(union vtd_irte *)(irte_page + (irte_addr & ~PAGE_MASK));
+
+       irq_msg.valid =
+               (root_irte.field.p && root_irte.field.sid == device_id);
+       irq_msg.vector = root_irte.field.vector;
+       irq_msg.delivery_mode = root_irte.field.delivery_mode;
+       irq_msg.dest_logical = root_irte.field.dest_logical;
+       irq_msg.level_triggered = root_irte.field.level_triggered;
+       irq_msg.redir_hint = root_irte.field.redir_hint;
+       irq_msg.destination = root_irte.field.destination;
+       if (!using_x2apic)
+               /* xAPIC in flat mode: APIC ID in 47:40 (of 63:32) */
+               irq_msg.destination >>= 8;
+
+       unit->irte_map[remap_index].device_id = device_id;
+       unit->irte_map[remap_index].vector = vector;
+       unit->irte_map[remap_index].used = 1;
+
+       return irq_msg;
+}
+
 int vtd_map_interrupt(struct cell *cell, u16 device_id, unsigned int vector,
                      struct apic_irq_message irq_msg)
 {
@@ -607,19 +849,28 @@ int vtd_map_interrupt(struct cell *cell, u16 device_id, unsigned int vector,
        if (!irte.field.assigned || irte.field.sid != device_id)
                return -ERANGE;
 
+       irte.field.p = irq_msg.valid;
+       if (!irte.field.p)
+               /*
+                * Do not validate non-present entries, they may contain
+                * invalid data and cause false-positives.
+                */
+               goto update_irte;
+
        /*
         * Validate delivery mode and destination(s).
         * Note that we do support redirection hint only in logical
         * destination mode.
         */
-       // TODO: Support x2APIC cluster mode
        if ((irq_msg.delivery_mode != APIC_MSG_DLVR_FIXED &&
             irq_msg.delivery_mode != APIC_MSG_DLVR_LOWPRI) ||
-           irq_msg.dest_logical != irq_msg.redir_hint ||
-           (using_x2apic && irq_msg.dest_logical))
+           irq_msg.dest_logical != irq_msg.redir_hint)
                return -EINVAL;
        if (irq_msg.dest_logical) {
-               dest &= cell->cpu_set->bitmap[0];
+               if (using_x2apic)
+                       dest = x2apic_filter_logical_dest(cell, dest);
+               else
+                       dest &= cell->cpu_set->bitmap[0];
                /*
                 * Linux may have programmed inactive vectors with too broad
                 * destination masks. Silently adjust them when programming the
@@ -643,7 +894,8 @@ int vtd_map_interrupt(struct cell *cell, u16 device_id, unsigned int vector,
                irte.field.destination <<= 8;
        irte.field.sq = VTD_IRTE_SQ_VERIFY_FULL_SID;
        irte.field.svt = VTD_IRTE_SVT_VERIFY_SID_SQ;
-       irte.field.p = 1;
+
+update_irte:
        vtd_update_irte(base_index + vector, irte);
 
        return base_index + vector;
@@ -686,14 +938,54 @@ void vtd_config_commit(struct cell *cell_added_removed)
        }
 }
 
+static void vtd_restore_ir(unsigned int unit_no, void *reg_base)
+{
+       struct vtd_emulation *unit = &root_cell_units[unit_no];
+       void *inv_queue = unit_inv_queue + unit_no * PAGE_SIZE;
+       void *root_inv_queue;
+       u64 iqh;
+
+       mmio_write64(reg_base + VTD_IRTA_REG, unit->irta);
+       vtd_update_gcmd_reg(reg_base, VTD_GCMD_SIRTP, 1);
+       vtd_submit_iq_request(reg_base, inv_queue, &inv_global_int);
+
+       vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
+       mmio_write64(reg_base + VTD_IQT_REG, 0);
+       mmio_write64(reg_base + VTD_IQA_REG, unit->iqa);
+       vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 1);
+
+       /*
+        * Restore invalidation queue head pointer by issuing dummy requests
+        * until the hardware is in sync with the Linux state again.
+        */
+       iqh = unit->iqh;
+       root_inv_queue = page_map_get_guest_pages(NULL, unit->iqa, 1,
+                                                 PAGE_DEFAULT_FLAGS);
+       if (root_inv_queue)
+               while (mmio_read64(reg_base + VTD_IQH_REG) != iqh)
+                       vtd_submit_iq_request(reg_base, root_inv_queue, NULL);
+       else
+               printk("WARNING: Failed to restore invalidation queue head\n");
+
+       vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 1);
+
+       mmio_write32(reg_base + VTD_FEDATA_REG, unit->fedata);
+       mmio_write32(reg_base + VTD_FEADDR_REG, unit->feaddr);
+       mmio_write32(reg_base + VTD_FEUADDR_REG, unit->feuaddr);
+       mmio_write32(reg_base + VTD_FECTL_REG, unit->fectl);
+}
+
 void vtd_shutdown(void)
 {
        void *reg_base = dmar_reg_base;
        unsigned int n;
 
        for (n = 0; n < dmar_units; n++, reg_base += PAGE_SIZE) {
-               vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 0);
                vtd_update_gcmd_reg(reg_base, VTD_GCMD_TE, 0);
-               vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
+               vtd_update_gcmd_reg(reg_base, VTD_GCMD_IRE, 0);
+               if (root_cell.vtd.ir_emulation)
+                       vtd_restore_ir(n, reg_base);
+               else
+                       vtd_update_gcmd_reg(reg_base, VTD_GCMD_QIE, 0);
        }
 }