]> rtime.felk.cvut.cz Git - jailhouse.git/commitdiff
core: Virtualize MSI-X for interrupt remapping support
author Jan Kiszka <jan.kiszka@siemens.com>
Fri, 15 Aug 2014 13:04:53 +0000 (15:04 +0200)
committer Jan Kiszka <jan.kiszka@siemens.com>
Tue, 26 Aug 2014 17:56:49 +0000 (19:56 +0200)
Similar to MSI support, this adds virtualization of MSI-X in order to
remap those interrupts via VT-d.

We have to intercept the MMIO access to the MSI-X tables of PCI devices
for this. Finding the corresponding device is done via a separate
per-cell list of all MSI-X capable PCI devices a cell contains. It is
built during cell creation, i.e. when devices are added, and it shrinks
again when devices are removed from a cell.

MSI-X device handover from/to Linux is simpler as we can centrally mask
all MSI-X vectors of a device.

As we may intercept more than the MSI-X table, accesses beyond it have
to be processed as well. Writes to the PBA are not allowed, reads from
anything after the table are simply performed on behalf of the cell.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
hypervisor/arch/x86/include/asm/cell.h
hypervisor/arch/x86/pci.c
hypervisor/arch/x86/vtd.c
hypervisor/include/jailhouse/pci.h
hypervisor/pci.c

index 1767c691aabd0e62e148b5bacff9cd67b9951e33..ba18d70673f550c4d02c266120d6dece25138c3f 100644 (file)
@@ -50,6 +50,7 @@ struct cell {
        struct cell *next;
 
        struct pci_device *pci_devices;
+       struct pci_device *msix_device_list;
        u32 pci_addr_port_val;
 
        u32 ioapic_index_reg_val;
index feab50ccdbe937963ebcd0639c977d49cec65b5d..393795d50fcee92a5d19ad1571fccd0f2c51672f 100644 (file)
@@ -12,6 +12,7 @@
  */
 
 #include <jailhouse/control.h>
+#include <jailhouse/mmio.h>
 #include <jailhouse/pci.h>
 #include <jailhouse/printk.h>
 #include <jailhouse/utils.h>
@@ -344,3 +345,36 @@ int pci_update_msi(struct pci_device *device,
 
        return 0;
 }
+
+/*
+ * Apply the shadowed address/data pair of MSI-X vector @index to the device's
+ * hardware MSI-X table.
+ *
+ * Nothing is written while the enable bit of the shadowed MSI-X registers is
+ * clear. Otherwise the vector is handed to vtd_map_interrupt(); on success
+ * the table entry receives the remap address derived from the returned value
+ * and a data value of 0.
+ *
+ * Returns 0 on success or the negative error code of vtd_map_interrupt().
+ */
+int pci_update_msix_vector(struct pci_device *device, unsigned int index)
+{
+       const union pci_msix_vector *shadow = &device->msix_vectors[index];
+       union x86_msi_vector msi = {
+               .raw.address = shadow->field.address,
+               .raw.data = shadow->field.data,
+       };
+       union pci_msix_vector *entry;
+       int result;
+
+       if (!device->msix_registers.field.enable)
+               return 0;
+
+       result = vtd_map_interrupt(device->cell, device->info->bdf, index,
+                                  pci_translate_msi_vector(device, index, 0,
+                                                           msi));
+       /* -ENOSYS is tolerated below, every other error is passed on */
+       if (result < 0 && result != -ENOSYS)
+               return result;
+
+       entry = &device->msix_table[index];
+       if (result == -ENOSYS) {
+               // HACK for QEMU: write the shadowed vector through unmodified
+               mmio_write64(&entry->field.address, shadow->field.address);
+               mmio_write32(&entry->field.data, shadow->field.data);
+       } else {
+               mmio_write64(&entry->field.address,
+                            pci_get_x86_msi_remap_address(result));
+               mmio_write32(&entry->field.data, 0);
+       }
+
+       return 0;
+}
index 34570de27d484a926e94ffe19a63a084bd8bc061..070da9e0b4db480191f458725741e10ecf278fa7 100644 (file)
@@ -446,7 +446,8 @@ static void vtd_free_int_remap_region(u16 device_id, unsigned int length)
 
 int vtd_add_pci_device(struct cell *cell, struct pci_device *device)
 {
-       unsigned int max_vectors = device->info->num_msi_vectors;
+       unsigned int max_vectors = MAX(device->info->num_msi_vectors,
+                                      device->info->num_msix_vectors);
        u16 bdf = device->info->bdf;
        u64 *root_entry_lo = &root_entry_table[PCI_BUS(bdf)].lo_word;
        struct vtd_entry *context_entry_table, *context_entry;
@@ -505,7 +506,8 @@ void vtd_remove_pci_device(struct pci_device *device)
        context_entry->lo_word &= ~VTD_CTX_PRESENT;
        flush_cache(&context_entry->lo_word, sizeof(u64));
 
-       vtd_free_int_remap_region(bdf, device->info->num_msi_vectors);
+       /* must free the same number of vectors vtd_add_pci_device() sized */
+       vtd_free_int_remap_region(bdf, MAX(device->info->num_msi_vectors,
+                                          device->info->num_msix_vectors));
 
        for (n = 0; n < 256; n++)
                if (context_entry_table[n].lo_word & VTD_CTX_PRESENT)
index 31f54b986953d28abe33c3d4fc019a0ba0678617..22d64b79f1e956ff1afdbadd0a317e952e489268 100644 (file)
@@ -24,6 +24,8 @@
 # define PCI_CMD_MASTER                (1 << 2)
 # define PCI_CMD_INTX_OFF      (1 << 10)
 
+#define PCI_MAX_MSIX_VECTORS   16 /* size of pci_device.msix_vectors[] */
+
 enum pci_access { PCI_ACCESS_REJECT, PCI_ACCESS_PERFORM, PCI_ACCESS_DONE };
 
 union pci_msi_registers {
@@ -44,11 +46,35 @@ union pci_msi_registers {
        u32 raw[4];
 } __attribute__((packed));
 
+union pci_msix_registers {
+       struct {
+               u16 padding; /* low half of the dword, not interpreted here */
+               u16 ignore:14, /* remaining bits of the upper half, not interpreted here */
+                   fmask:1, /* set by pci_suppress_msix() when suppressing the device */
+                   enable:1; /* vectors are only programmed while this is set */
+       } __attribute__((packed)) field;
+       u32 raw; /* dword as read from / written to config space at the capability start */
+} __attribute__((packed));
+
+union pci_msix_vector {
+       struct {
+               u64 address; /* message address, dwords 0-1 of the entry */
+               u32 data; /* message data, dword 2 */
+               u32 ctrl; /* vector control, dword 3 */
+       } __attribute__((packed)) field;
+       u32 raw[4]; /* dword-wise view of the same entry */
+} __attribute__((packed));
+
 struct pci_device {
        const struct jailhouse_pci_device *info;
        struct cell *cell;
 
        union pci_msi_registers msi_registers;
+
+       union pci_msix_registers msix_registers; /* shadow of the MSI-X capability's first dword */
+       struct pci_device *next_msix_device; /* link in the owning cell's msix_device_list */
+       union pci_msix_vector *msix_table; /* hypervisor mapping of the device's MSI-X region */
+       union pci_msix_vector msix_vectors[PCI_MAX_MSIX_VECTORS]; /* shadowed table entries */
 };
 
 int pci_init(void);
@@ -77,6 +103,7 @@ void pci_suppress_msi(struct pci_device *device,
                      const struct jailhouse_pci_capability *cap);
 int pci_update_msi(struct pci_device *device,
                   const struct jailhouse_pci_capability *cap);
+int pci_update_msix_vector(struct pci_device *device, unsigned int index);
 
 void pci_prepare_handover(void);
 void pci_shutdown(void);
index 1ff8dc19232de7ba137d334e5a2a5187a9908fa7..322961693df816ad68dae61a16260b5a2bd3bbee 100644 (file)
@@ -22,6 +22,8 @@
 #define PCI_CAP_MSI                    0x05
 #define PCI_CAP_MSIX                   0x11
 
+#define MSIX_VECTOR_CTRL_DWORD         3 /* dword index of the ctrl field in an MSI-X table entry */
+
 #define for_each_configured_pci_device(dev, cell)                      \
        for ((dev) = (cell)->pci_devices;                               \
             (dev) - (cell)->pci_devices < (cell)->config->num_pci_devices; \
@@ -192,6 +194,20 @@ enum pci_access pci_cfg_read_moderate(struct pci_device *device, u16 address,
        return PCI_ACCESS_PERFORM;
 }
 
+/*
+ * Re-program every MSI-X vector of @device via pci_update_msix_vector().
+ *
+ * Stops at the first vector that fails and returns its negative error code;
+ * returns 0 if all vectors were updated. @cap is currently not evaluated.
+ */
+static int pci_update_msix(struct pci_device *device,
+                          const struct jailhouse_pci_capability *cap)
+{
+       unsigned int vector = 0;
+       int err;
+
+       while (vector < device->info->num_msix_vectors) {
+               err = pci_update_msix_vector(device, vector);
+               if (err < 0)
+                       return err;
+               vector++;
+       }
+       return 0;
+}
+
 /**
  * pci_cfg_write_moderate() - Moderate config space write access
  * @device:    The device to be accessed; if NULL, access will be rejected
@@ -208,8 +224,8 @@ enum pci_access pci_cfg_write_moderate(struct pci_device *device, u16 address,
        /* initialize list to work around wrong compiler warning */
        const struct pci_cfg_access *list = NULL;
        unsigned int bias_shift = (address % 4) * 8;
+       u32 mask = BYTE_MASK(size) << bias_shift;
        unsigned int n, cap_offs, len = 0;
-       u32 mask = BYTE_MASK(size);
 
        if (!device)
                return PCI_ACCESS_REJECT;
@@ -225,7 +241,7 @@ enum pci_access pci_cfg_write_moderate(struct pci_device *device, u16 address,
 
                for (n = 0; n < len; n++) {
                        if (list[n].reg_num == (address & 0xffc) &&
-                           ((list[n].mask >> bias_shift) & mask) == mask)
+                           (list[n].mask & mask) == mask)
                                return PCI_ACCESS_PERFORM;
                }
 
@@ -236,11 +252,11 @@ enum pci_access pci_cfg_write_moderate(struct pci_device *device, u16 address,
        if (!cap || !(cap->flags & JAILHOUSE_PCICAPS_WRITE))
                return PCI_ACCESS_REJECT;
 
+       value <<= bias_shift;
+
        cap_offs = address - cap->start;
        if (cap->id == PCI_CAP_MSI &&
            (cap_offs < 10 || (device->info->msi_64bits && cap_offs < 14))) {
-               value <<= bias_shift;
-               mask <<= bias_shift;
                device->msi_registers.raw[cap_offs / 4] &= ~mask;
                device->msi_registers.raw[cap_offs / 4] |= value;
 
@@ -253,6 +269,12 @@ enum pci_access pci_cfg_write_moderate(struct pci_device *device, u16 address,
                 */
                if (cap_offs >= 4)
                        return PCI_ACCESS_DONE;
+       } else if (cap->id == PCI_CAP_MSIX && cap_offs < 4) {
+               device->msix_registers.raw &= ~mask;
+               device->msix_registers.raw |= value;
+
+               if (pci_update_msix(device, cap) < 0)
+                       return PCI_ACCESS_REJECT;
        }
 
        return PCI_ACCESS_PERFORM;
@@ -290,6 +312,62 @@ int pci_init(void)
                               PAGE_MAP_NON_COHERENT);
 }
 
+/*
+ * Handle a cell's MMIO access to the intercepted MSI-X region of one of its
+ * devices.
+ *
+ * The device is looked up in the cell's MSI-X device list. Writes to the
+ * address/data dwords of a table entry update the shadow copy and are then
+ * applied via pci_update_msix_vector(); writes to the vector control dword
+ * go straight to the hardware table. Reads return the shadow values for
+ * address/data and the hardware content for everything else.
+ *
+ * Returns 1 if the access was handled, 0 if @addr does not belong to any
+ * MSI-X region of the cell, and -1 on an invalid access.
+ */
+static int pci_msix_access_handler(const struct cell *cell, bool is_write,
+                                  u64 addr, u32 *value)
+{
+       struct pci_device *device = cell->msix_device_list;
+       unsigned int dword, index;
+       u64 offs;
+
+       while (device) {
+               if (addr >= device->info->msix_address &&
+                   addr < device->info->msix_address +
+                          device->info->msix_region_size)
+                       goto found;
+               device = device->next_msix_device;
+       }
+       return 0;
+
+found:
+       /* access must be DWORD-aligned */
+       if (addr & 0x3)
+               goto invalid_access;
+
+       /*
+        * Derive both the entry index and the dword within the entry from
+        * the offset into the region, so the two stay consistent even if the
+        * region does not start on an entry-size boundary.
+        */
+       offs = addr - device->info->msix_address;
+       index = offs / sizeof(union pci_msix_vector);
+       dword = (offs % sizeof(union pci_msix_vector)) >> 2;
+
+       if (is_write) {
+               /*
+                * The PBA may share a page with the MSI-X table. Writing to
+                * PBA entries is undefined. We declare it as invalid.
+                */
+               if (index >= device->info->num_msix_vectors)
+                       goto invalid_access;
+               if (dword == MSIX_VECTOR_CTRL_DWORD) {
+                       mmio_write32(&device->msix_table[index].field.ctrl,
+                                    *value);
+               } else {
+                       device->msix_vectors[index].raw[dword] = *value;
+                       if (pci_update_msix_vector(device, index) < 0)
+                               goto invalid_access;
+               }
+       } else {
+               /* anything but shadowed address/data is read from hardware */
+               if (index >= device->info->num_msix_vectors ||
+                   dword == MSIX_VECTOR_CTRL_DWORD)
+                       *value =
+                           mmio_read32(((void *)device->msix_table) + offs);
+               else
+                       *value = device->msix_vectors[index].raw[dword];
+       }
+       return 1;
+
+invalid_access:
+       /* reached for misaligned reads as well, so do not call it a write */
+       panic_printk("FATAL: Invalid PCI MSI-X table/PBA access, device "
+                    "%02x:%02x.%x\n", PCI_BDF_PARAMS(device->info->bdf));
+       return -1;
+}
+
 /**
  * pci_mmio_access_handler() - Handler for MMIO-accesses to PCI config space
  * @cell:      Request issuing cell
@@ -307,7 +385,7 @@ int pci_mmio_access_handler(const struct cell *cell, bool is_write,
        enum pci_access access;
 
        if (!pci_space || addr < mmcfg_start || addr > mmcfg_end)
-               return 0;
+               return pci_msix_access_handler(cell, is_write, addr, value);
 
        mmcfg_offset = addr - mmcfg_start;
        reg_addr = mmcfg_offset & 0xfff;
@@ -365,6 +443,43 @@ static void pci_restore_msi(struct pci_device *device,
                                 device->msi_registers.raw[n], 4);
 }
 
+/*
+ * Write the shadowed MSI-X capability dword of @device to config space.
+ *
+ * With @suppressed set, the fmask bit is forced to 1 in the written value;
+ * otherwise the shadow is written unmodified. The shadow itself is never
+ * changed.
+ */
+static void pci_suppress_msix(struct pci_device *device,
+                             const struct jailhouse_pci_capability *cap,
+                             bool suppressed)
+{
+       union pci_msix_registers masked = device->msix_registers;
+
+       masked.field.fmask = 1;
+       pci_write_config(device->info->bdf, cap->start,
+                        suppressed ? masked.raw : device->msix_registers.raw,
+                        4);
+}
+
+/*
+ * Capture the current MSI-X state of @device in its shadow fields: the
+ * capability dword from config space and, for every vector, the first three
+ * dwords of the table entry.
+ */
+static void pci_save_msix(struct pci_device *device,
+                         const struct jailhouse_pci_capability *cap)
+{
+       unsigned int vector, dword;
+
+       device->msix_registers.raw =
+               pci_read_config(device->info->bdf, cap->start, 4);
+
+       for (vector = 0; vector < device->info->num_msix_vectors; vector++) {
+               union pci_msix_vector *hw = &device->msix_table[vector];
+               union pci_msix_vector *shadow = &device->msix_vectors[vector];
+
+               /* dwords 0..2 are address and data; dword 3 is not saved */
+               for (dword = 0; dword < 3; dword++)
+                       shadow->raw[dword] = mmio_read32(&hw->raw[dword]);
+       }
+}
+
+/*
+ * Write the shadowed first three dwords of every MSI-X vector back to the
+ * device's table, then write the shadowed capability dword to config space
+ * without forcing the fmask bit.
+ */
+static void pci_restore_msix(struct pci_device *device,
+                            const struct jailhouse_pci_capability *cap)
+{
+       unsigned int vector, dword;
+
+       for (vector = 0; vector < device->info->num_msix_vectors; vector++) {
+               union pci_msix_vector *hw = &device->msix_table[vector];
+               union pci_msix_vector *shadow = &device->msix_vectors[vector];
+
+               /* dwords 0..2 are address and data; dword 3 is not written */
+               for (dword = 0; dword < 3; dword++)
+                       mmio_write32(&hw->raw[dword], shadow->raw[dword]);
+       }
+       pci_suppress_msix(device, cap, false);
+}
+
 /**
  * pci_prepare_handover() - Prepare the handover of PCI devices to Jailhouse or
  *                          back to Linux
@@ -383,24 +498,75 @@ void pci_prepare_handover(void)
                        for_each_pci_cap(cap, device, n)
                                if (cap->id == PCI_CAP_MSI)
                                        pci_suppress_msi(device, cap);
-                               // TODO: MSI-X
+                               else if (cap->id == PCI_CAP_MSIX)
+                                       pci_suppress_msix(device, cap, true);
        }
 }
 
 static int pci_add_device(struct cell *cell, struct pci_device *device)
 {
+       unsigned int size = device->info->msix_region_size;
+       int err;
+
        printk("Adding PCI device %02x:%02x.%x to cell \"%s\"\n",
               PCI_BDF_PARAMS(device->info->bdf), cell->config->name);
-       return arch_pci_add_device(cell, device);
+
+       err = arch_pci_add_device(cell, device);
+
+       if (!err && device->info->msix_address) {
+               device->msix_table = page_alloc(&remap_pool, size / PAGE_SIZE);
+               if (!device->msix_table) {
+                       err = -ENOMEM;
+                       goto error_remove_dev;
+               }
+
+               err = page_map_create(&hv_paging_structs,
+                                     device->info->msix_address, size,
+                                     (unsigned long)device->msix_table,
+                                     PAGE_DEFAULT_FLAGS | PAGE_FLAG_UNCACHED,
+                                     PAGE_MAP_NON_COHERENT);
+               if (err)
+                       goto error_page_free;
+
+               device->next_msix_device = cell->msix_device_list;
+               cell->msix_device_list = device;
+       }
+       return 0;
+
+error_page_free:
+       page_free(&remap_pool, device->msix_table, size / PAGE_SIZE);
+error_remove_dev:
+       arch_pci_remove_device(device);
+       return err;
 }
 
 static void pci_remove_device(struct pci_device *device)
 {
+       unsigned int size = device->info->msix_region_size;
+       struct pci_device *prev_msix_device;
+
        printk("Removing PCI device %02x:%02x.%x from cell \"%s\"\n",
               PCI_BDF_PARAMS(device->info->bdf), device->cell->config->name);
        arch_pci_remove_device(device);
        pci_write_config(device->info->bdf, PCI_CFG_COMMAND,
                         PCI_CMD_INTX_OFF, 2);
+
+       if (!device->msix_table)
+               return;
+
+       /* cannot fail, destruction of same size as construction */
+       page_map_destroy(&hv_paging_structs, (unsigned long)device->msix_table,
+                        size, PAGE_MAP_NON_COHERENT);
+       page_free(&remap_pool, device->msix_table, size / PAGE_SIZE);
+
+       prev_msix_device = device->cell->msix_device_list;
+       if (prev_msix_device == device) {
+               device->cell->msix_device_list = NULL;
+       } else {
+               while (prev_msix_device->next_msix_device != device)
+                       prev_msix_device = prev_msix_device->next_msix_device;
+               prev_msix_device->next_msix_device = NULL;
+       }
 }
 
 int pci_cell_init(struct cell *cell)
@@ -425,6 +591,11 @@ int pci_cell_init(struct cell *cell)
         * handy pointers. The cell pointer also encodes active ownership.
         */
        for (ndev = 0; ndev < cell->config->num_pci_devices; ndev++) {
+               if (dev_infos[ndev].num_msix_vectors > PCI_MAX_MSIX_VECTORS) {
+                       pci_cell_exit(cell);
+                       return -ERANGE;
+               }
+
                device = &cell->pci_devices[ndev];
                device->info = &dev_infos[ndev];
 
@@ -447,9 +618,7 @@ int pci_cell_init(struct cell *cell)
                        if (cap->id == PCI_CAP_MSI)
                                pci_save_msi(device, cap);
                        else if (cap->id == PCI_CAP_MSIX)
-                               // TODO: Handle
-                               printk("MSI-X left out @%02x:%02x.%x!\n",
-                                      PCI_BDF_PARAMS(device->info->bdf));
+                               pci_save_msix(device, cap);
        }
 
        if (cell == &root_cell)
@@ -509,9 +678,12 @@ void pci_config_commit(struct cell *cell_added_removed)
        for_each_configured_pci_device(device, &root_cell)
                if (device->cell)
                        for_each_pci_cap(cap, device, n) {
-                               if (cap->id == PCI_CAP_MSI)
+                               if (cap->id == PCI_CAP_MSI) {
                                        err = pci_update_msi(device, cap);
-                               // TODO: MSI-X
+                               } else if (cap->id == PCI_CAP_MSIX) {
+                                       err = pci_update_msix(device, cap);
+                                       pci_suppress_msix(device, cap, false);
+                               }
                                if (err)
                                        goto error;
                        }
@@ -537,5 +709,6 @@ void pci_shutdown(void)
                        for_each_pci_cap(cap, device, n)
                                if (cap->id == PCI_CAP_MSI)
                                        pci_restore_msi(device, cap);
-                               // TODO: MSI-X
+                               else if (cap->id == PCI_CAP_MSIX)
+                                       pci_restore_msix(device, cap);
 }