Merge remote-tracking branch 'kiszka/master'
[jailhouse.git] / hypervisor / arch / x86 / amd_iommu.c
index 1a076fb334450655f1e7903fefb372bd20945e24..b3b8087eecbff30f7a8798d938a55bfe38b91c43 100644 (file)
 /*
  * Jailhouse, a Linux-based partitioning hypervisor
  *
- * Copyright (c) Valentine Sinitsyn, 2014
+ * Copyright (c) Valentine Sinitsyn, 2014, 2015
+ * Copyright (c) Siemens AG, 2016
  *
  * Authors:
  *  Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
+ *  Jan Kiszka <jan.kiszka@siemens.com>
+ *
+ * Command posting and event log parsing code, as well as many defines,
+ * were adapted from Linux's amd_iommu driver written by Joerg Roedel
+ * and Leo Duran.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
  */
 
-#include <jailhouse/entry.h>
 #include <jailhouse/cell.h>
-#include <jailhouse/cell-config.h>
+#include <jailhouse/control.h>
+#include <jailhouse/mmio.h>
 #include <jailhouse/pci.h>
 #include <jailhouse/printk.h>
-#include <jailhouse/types.h>
+#include <jailhouse/string.h>
+#include <asm/amd_iommu.h>
 #include <asm/apic.h>
 #include <asm/iommu.h>
-#include <asm/percpu.h>
 
+#define CAPS_IOMMU_HEADER_REG          0x00
+#define  CAPS_IOMMU_EFR_SUP            (1 << 27)
+#define CAPS_IOMMU_BASE_LOW_REG                0x04
+#define  CAPS_IOMMU_ENABLE             (1 << 0)
+#define CAPS_IOMMU_BASE_HI_REG         0x08
+
+#define ACPI_REPORTING_HE_SUP          (1 << 7)
+
+#define AMD_DEV_TABLE_BASE_REG         0x0000
+#define AMD_CMD_BUF_BASE_REG           0x0008
+#define AMD_EVT_LOG_BASE_REG           0x0010
+#define AMD_CONTROL_REG                        0x0018
+#define  AMD_CONTROL_IOMMU_EN          (1UL << 0)
+#define  AMD_CONTROL_EVT_LOG_EN                (1UL << 2)
+#define  AMD_CONTROL_EVT_INT_EN                (1UL << 3)
+#define  AMD_CONTROL_COMM_WAIT_INT_EN  (1UL << 4)
+#define  AMD_CONTROL_CMD_BUF_EN                (1UL << 12)
+#define  AMD_CONTROL_SMIF_EN           (1UL << 22)
+#define  AMD_CONTROL_SMIFLOG_EN                (1UL << 24)
+#define  AMD_CONTROL_SEG_EN_MASK       BIT_MASK(36, 34)
+#define  AMD_CONTROL_SEG_EN_SHIFT      34
+#define AMD_EXT_FEATURES_REG           0x0030
+#define  AMD_EXT_FEAT_HE_SUP           (1UL << 7)
+#define  AMD_EXT_FEAT_SMI_FSUP         (1UL << 16)
+#define  AMD_EXT_FEAT_SMI_FSUP_MASK    BIT_MASK(17, 16)
+#define  AMD_EXT_FEAT_SMI_FRC_MASK     BIT_MASK(20, 18)
+#define  AMD_EXT_FEAT_SMI_FRC_SHIFT    18
+#define  AMD_EXT_FEAT_SEG_SUP_MASK     BIT_MASK(39, 38)
+#define  AMD_EXT_FEAT_SEG_SUP_SHIFT    38
+#define AMD_HEV_UPPER_REG              0x0040
+#define AMD_HEV_LOWER_REG              0x0048
+#define AMD_HEV_STATUS_REG             0x0050
+#define  AMD_HEV_VALID                 (1UL << 1)
+#define  AMD_HEV_OVERFLOW              (1UL << 2)
+#define AMD_SMI_FILTER0_REG            0x0060
+#define  AMD_SMI_FILTER_VALID          (1UL << 16)
+#define  AMD_SMI_FILTER_LOCKED         (1UL << 17)
+#define AMD_DEV_TABLE_SEG1_REG         0x0100
+#define AMD_CMD_BUF_HEAD_REG           0x2000
+#define AMD_CMD_BUF_TAIL_REG           0x2008
+#define AMD_EVT_LOG_HEAD_REG           0x2010
+#define AMD_EVT_LOG_TAIL_REG           0x2018
+#define AMD_STATUS_REG                 0x2020
+# define AMD_STATUS_EVT_OVERFLOW       (1UL << 0)
+# define AMD_STATUS_EVT_LOG_INT                (1UL << 1)
+# define AMD_STATUS_EVT_LOG_RUN                (1UL << 3)
+
+struct dev_table_entry {
+       u64 raw64[4];
+} __attribute__((packed));
+
+#define DTE_VALID                      (1UL << 0)
+#define DTE_TRANSLATION_VALID          (1UL << 1)
+#define DTE_PAGING_MODE_4_LEVEL                (4UL << 9)
+#define DTE_IR                         (1UL << 61)
+#define DTE_IW                         (1UL << 62)
+
+#define DEV_TABLE_SEG_MAX              8
+#define DEV_TABLE_SIZE                 0x200000
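+/*
+ * A full, unsegmented device table has one 32-byte dev_table_entry per
+ * 16-bit BDF: 65536 * 32 bytes = 0x200000, optionally split into up to
+ * eight segments.
+ */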
+
+union buf_entry {
+       u32 raw32[4];
+       u64 raw64[2];
+       struct {
+               u32 pad0;
+               u32 pad1:28;
+               u32 type:4;
+       };
+} __attribute__((packed));
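+/*
+ * Commands and event log entries share this 128-bit layout: the type
+ * bitfield maps to bits 63:60 of the first quadword, which is where the
+ * IOMMU expects the command opcode and reports the event code.
+ */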
+
+#define CMD_COMPL_WAIT                 0x01
+# define CMD_COMPL_WAIT_STORE          (1 << 0)
+# define CMD_COMPL_WAIT_INT            (1 << 1)
+
+#define CMD_INV_DEVTAB_ENTRY           0x02
+
+#define CMD_INV_IOMMU_PAGES            0x03
+# define CMD_INV_IOMMU_PAGES_SIZE      (1 << 0)
+# define CMD_INV_IOMMU_PAGES_PDE       (1 << 1)
+
+#define EVENT_TYPE_ILL_DEV_TAB_ENTRY   0x01
+#define EVENT_TYPE_PAGE_TAB_HW_ERR     0x04
+#define EVENT_TYPE_ILL_CMD_ERR         0x05
+#define EVENT_TYPE_CMD_HW_ERR          0x06
+#define EVENT_TYPE_IOTLB_INV_TIMEOUT   0x07
+#define EVENT_TYPE_INV_PPR_REQ         0x09
+
+#define BUF_LEN_EXPONENT_SHIFT         56
+
+/* Allocate minimum space possible (4K or 256 entries) */
+#define BUF_SIZE(name, entry)          ((1UL << name##_LEN_EXPONENT) * \
+                                         sizeof(entry))
+
+#define CMD_BUF_LEN_EXPONENT           8
+#define EVT_LOG_LEN_EXPONENT           8
+
+#define CMD_BUF_SIZE                   BUF_SIZE(CMD_BUF, union buf_entry)
+#define EVT_LOG_SIZE                   BUF_SIZE(EVT_LOG, union buf_entry)
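+/* One union buf_entry is 16 bytes, so both buffers are (1 << 8) * 16 = 4K. */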
+
+#define BITS_PER_SHORT                 16
+
+#define AMD_IOMMU_MAX_PAGE_TABLE_LEVELS        4
+
+static struct amd_iommu {
+       int idx;
+       void *mmio_base;
+       /* Command Buffer, Event Log */
+       unsigned char *cmd_buf_base;
+       unsigned char *evt_log_base;
+       /* Device table */
+       void *devtable_segments[DEV_TABLE_SEG_MAX];
+       u8 dev_tbl_seg_sup;
+       u32 cmd_tail_ptr;
+       bool he_supported;
+} iommu_units[JAILHOUSE_MAX_IOMMU_UNITS];
+
+#define for_each_iommu(iommu) for (iommu = iommu_units; \
+                                  iommu < iommu_units + iommu_units_count; \
+                                  iommu++)
+
+static unsigned int iommu_units_count;
+
+/*
+ * Interrupt remapping is not emulated on AMD,
+ * thus we have no MMIO to intercept.
+ */
 unsigned int iommu_mmio_count_regions(struct cell *cell)
 {
        return 0;
 }
 
-int iommu_init(void)
+bool iommu_cell_emulates_ir(struct cell *cell)
 {
-       printk("WARNING: AMD IOMMU support is not implemented yet\n");
-       /* TODO: Implement */
+       return false;
+}
+
+static int amd_iommu_init_pci(struct amd_iommu *entry,
+                             struct jailhouse_iommu *iommu)
+{
+       u64 caps_header, hi, lo;
+
+       /* Check alignment */
+       if (iommu->size & (iommu->size - 1))
+               return trace_error(-EINVAL);
+
+       /* Check that EFR is supported */
+       caps_header = pci_read_config(iommu->amd_bdf, iommu->amd_base_cap, 4);
+       if (!(caps_header & CAPS_IOMMU_EFR_SUP))
+               return trace_error(-EIO);
+
+       lo = pci_read_config(iommu->amd_bdf,
+                            iommu->amd_base_cap + CAPS_IOMMU_BASE_LOW_REG, 4);
+       hi = pci_read_config(iommu->amd_bdf,
+                            iommu->amd_base_cap + CAPS_IOMMU_BASE_HI_REG, 4);
+
+       if (lo & CAPS_IOMMU_ENABLE &&
+           ((hi << 32) | lo) != (iommu->base | CAPS_IOMMU_ENABLE)) {
+               printk("FATAL: IOMMU %d config is locked in invalid state.\n",
+                      entry->idx);
+               return trace_error(-EPERM);
+       }
+
+       /* Should be configured by BIOS, but we want to be sure */
+       pci_write_config(iommu->amd_bdf,
+                        iommu->amd_base_cap + CAPS_IOMMU_BASE_HI_REG,
+                        (u32)(iommu->base >> 32), 4);
+       pci_write_config(iommu->amd_bdf,
+                        iommu->amd_base_cap + CAPS_IOMMU_BASE_LOW_REG,
+                        (u32)(iommu->base & 0xffffffff) | CAPS_IOMMU_ENABLE,
+                        4);
+
+       /* Allocate and map MMIO space */
+       entry->mmio_base = page_alloc(&remap_pool, PAGES(iommu->size));
+       if (!entry->mmio_base)
+               return -ENOMEM;
+
+       return paging_create(&hv_paging_structs, iommu->base, iommu->size,
+                            (unsigned long)entry->mmio_base,
+                            PAGE_DEFAULT_FLAGS | PAGE_FLAG_DEVICE,
+                            PAGING_NON_COHERENT);
+}
+
+static int amd_iommu_init_features(struct amd_iommu *entry,
+                                  struct jailhouse_iommu *iommu)
+{
+       u64 efr = mmio_read64(entry->mmio_base + AMD_EXT_FEATURES_REG);
+       unsigned char smi_filter_regcnt;
+       u64 val, ctrl_reg = 0, smi_freg = 0;
+       unsigned int n;
+       void *reg_base;
+
+       /*
+        * Require SMI Filter support. Enable and lock filter but
+        * mark all entries as invalid to disable SMI delivery.
+        */
+       if ((efr & AMD_EXT_FEAT_SMI_FSUP_MASK) != AMD_EXT_FEAT_SMI_FSUP)
+               return trace_error(-EIO);
+
+       /* Figure out if hardware events are supported. */
+       if (iommu->amd_features)
+               entry->he_supported =
+                       iommu->amd_features & ACPI_REPORTING_HE_SUP;
+       else
+               entry->he_supported = efr & AMD_EXT_FEAT_HE_SUP;
+
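+       /*
+        * SmiFRC in the EFR encodes the number of SMI filter registers as a
+        * power of two, hence the shift below.
+        */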
+       smi_filter_regcnt = 1 << ((efr & AMD_EXT_FEAT_SMI_FRC_MASK) >>
+                                 AMD_EXT_FEAT_SMI_FRC_SHIFT);
+       for (n = 0; n < smi_filter_regcnt; n++) {
+               reg_base = entry->mmio_base + AMD_SMI_FILTER0_REG + (n << 3);
+               smi_freg = mmio_read64(reg_base);
+
+               if (!(smi_freg & AMD_SMI_FILTER_LOCKED)) {
+                       /*
+                        * Program the unlocked register the way we need it:
+                        * invalid and locked.
+                        */
+                       mmio_write64(reg_base, AMD_SMI_FILTER_LOCKED);
+               } else if (smi_freg & AMD_SMI_FILTER_VALID) {
+                       /*
+                        * The register is locked and programmed in a way we
+                        * don't want - this is an error.
+                        */
+                       printk("ERROR: SMI Filter register %d is locked "
+                              "and can't be reprogrammed.\n"
+                              "Reboot and check that no other component uses "
+                              "IOMMU %d.\n", n, entry->idx);
+                       return trace_error(-EPERM);
+               }
+               /*
+                * Otherwise the register is locked but already programmed
+                * the way we need it - OK to go.
+                */
+       }
+
+       ctrl_reg |= (AMD_CONTROL_SMIF_EN | AMD_CONTROL_SMIFLOG_EN);
+
+       /* Enable maximum Device Table segmentation possible */
+       entry->dev_tbl_seg_sup = (efr & AMD_EXT_FEAT_SEG_SUP_MASK) >>
+               AMD_EXT_FEAT_SEG_SUP_SHIFT;
+       if (entry->dev_tbl_seg_sup) {
+               val = (u64)entry->dev_tbl_seg_sup << AMD_CONTROL_SEG_EN_SHIFT;
+               ctrl_reg |= val & AMD_CONTROL_SEG_EN_MASK;
+       }
+
+       mmio_write64(entry->mmio_base + AMD_CONTROL_REG, ctrl_reg);
+
        return 0;
 }
 
+static int amd_iommu_init_buffers(struct amd_iommu *entry,
+                                 struct jailhouse_iommu *iommu)
+{
+       /* Allocate and configure command buffer */
+       entry->cmd_buf_base = page_alloc(&mem_pool, PAGES(CMD_BUF_SIZE));
+       if (!entry->cmd_buf_base)
+               return -ENOMEM;
+
+       mmio_write64(entry->mmio_base + AMD_CMD_BUF_BASE_REG,
+                    paging_hvirt2phys(entry->cmd_buf_base) |
+                    ((u64)CMD_BUF_LEN_EXPONENT << BUF_LEN_EXPONENT_SHIFT));
+
+       entry->cmd_tail_ptr = 0;
+
+       /* Allocate and configure event log */
+       entry->evt_log_base = page_alloc(&mem_pool, PAGES(EVT_LOG_SIZE));
+       if (!entry->evt_log_base)
+               return -ENOMEM;
+
+       mmio_write64(entry->mmio_base + AMD_EVT_LOG_BASE_REG,
+                    paging_hvirt2phys(entry->evt_log_base) |
+                    ((u64)EVT_LOG_LEN_EXPONENT << BUF_LEN_EXPONENT_SHIFT));
+
+       return 0;
+}
+
+static void amd_iommu_enable_command_processing(struct amd_iommu *iommu)
+{
+       u64 ctrl_reg;
+
+       ctrl_reg = mmio_read64(iommu->mmio_base + AMD_CONTROL_REG);
+       ctrl_reg |= AMD_CONTROL_IOMMU_EN | AMD_CONTROL_CMD_BUF_EN |
+               AMD_CONTROL_EVT_LOG_EN | AMD_CONTROL_EVT_INT_EN;
+       mmio_write64(iommu->mmio_base + AMD_CONTROL_REG, ctrl_reg);
+}
+
+int iommu_init(void)
+{
+       struct jailhouse_iommu *iommu;
+       struct amd_iommu *entry;
+       unsigned int n;
+       int err;
+
+       iommu = &system_config->platform_info.x86.iommu_units[0];
+       for (n = 0; iommu->base && n < iommu_count_units(); iommu++, n++) {
+               entry = &iommu_units[iommu_units_count];
+
+               entry->idx = n;
+
+               /* Protect against accidental VT-d configs. */
+               if (!iommu->amd_bdf)
+                       return trace_error(-EINVAL);
+
+               printk("AMD IOMMU @0x%lx/0x%x\n", iommu->base, iommu->size);
+
+               /* Initialize PCI registers and MMIO space */
+               err = amd_iommu_init_pci(entry, iommu);
+               if (err)
+                       return err;
+
+               /* Setup IOMMU features */
+               err = amd_iommu_init_features(entry, iommu);
+               if (err)
+                       return err;
+
+               /* Initialize command buffer and event log */
+               err = amd_iommu_init_buffers(entry, iommu);
+               if (err)
+                       return err;
+
+               /* Enable the IOMMU */
+               amd_iommu_enable_command_processing(entry);
+
+               iommu_units_count++;
+       }
+
+       return iommu_cell_init(&root_cell);
+}
+
 int iommu_cell_init(struct cell *cell)
 {
-       /* TODO: Implement */
+       /* HACK for QEMU */
+       if (iommu_units_count == 0)
+               return 0;
+
+       if (cell->id > 0xffff)
+               return trace_error(-ERANGE);
+
        return 0;
 }
 
+static void amd_iommu_completion_wait(struct amd_iommu *iommu);
+
+static void amd_iommu_submit_command(struct amd_iommu *iommu,
+                                    union buf_entry *cmd, bool draining)
+{
+       u32 head, next_tail, bytes_free;
+       unsigned char *cur_ptr;
+
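+       /*
+        * The command buffer is a byte ring: the unsigned wrap-around in
+        * (head - next_tail) % CMD_BUF_SIZE yields the space still free
+        * between the prospective new tail and the current head.
+        */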
+       head = mmio_read64(iommu->mmio_base + AMD_CMD_BUF_HEAD_REG);
+       next_tail = (iommu->cmd_tail_ptr + sizeof(*cmd)) % CMD_BUF_SIZE;
+       bytes_free = (head - next_tail) % CMD_BUF_SIZE;
+
+       /* Leave space for COMPLETION_WAIT that drains the buffer. */
+       if (bytes_free < (2 * sizeof(*cmd)) && !draining)
+               /* Drain the buffer */
+               amd_iommu_completion_wait(iommu);
+
+       cur_ptr = &iommu->cmd_buf_base[iommu->cmd_tail_ptr];
+       memcpy(cur_ptr, cmd, sizeof(*cmd));
+
+       /* Just to be sure. */
+       arch_paging_flush_cpu_caches(cur_ptr, sizeof(*cmd));
+
+       iommu->cmd_tail_ptr =
+               (iommu->cmd_tail_ptr + sizeof(*cmd)) % CMD_BUF_SIZE;
+}
+
+u64 amd_iommu_get_memory_region_flags(const struct jailhouse_memory *mem)
+{
+       unsigned long flags = AMD_IOMMU_PTE_P;
+
+       if (!(mem->flags & JAILHOUSE_MEM_DMA))
+               return 0;
+
+       if (mem->flags & JAILHOUSE_MEM_READ)
+               flags |= AMD_IOMMU_PTE_IR;
+       if (mem->flags & JAILHOUSE_MEM_WRITE)
+               flags |= AMD_IOMMU_PTE_IW;
+
+       return flags;
+}
+
 int iommu_map_memory_region(struct cell *cell,
                            const struct jailhouse_memory *mem)
 {
-       /* TODO: Implement */
+       /*
+        * Check that the address is not outside the scope of the page tables.
+        * With 4 levels, we only support 48 address bits.
+        */
+       if (mem->virt_start & BIT_MASK(63, 48))
+               return trace_error(-E2BIG);
+
+       /* vcpu_map_memory_region already did the actual work. */
        return 0;
 }
+
 int iommu_unmap_memory_region(struct cell *cell,
                              const struct jailhouse_memory *mem)
 {
-       /* TODO: Implement */
+       /* vcpu_map_memory_region already did the actual work. */
        return 0;
 }
 
+static void amd_iommu_inv_dte(struct amd_iommu *iommu, u16 device_id)
+{
+       union buf_entry invalidate_dte = {{ 0 }};
+
+       invalidate_dte.raw32[0] = device_id;
+       invalidate_dte.type = CMD_INV_DEVTAB_ENTRY;
+
+       amd_iommu_submit_command(iommu, &invalidate_dte, false);
+}
+
+static struct dev_table_entry *get_dev_table_entry(struct amd_iommu *iommu,
+                                                  u16 bdf, bool allocate)
+{
+       struct dev_table_entry *devtable_seg;
+       u8 seg_idx, seg_shift;
+       u64 reg_base, reg_val;
+       unsigned int n;
+       u16 seg_mask;
+       u32 seg_size;
+
+       if (!iommu->dev_tbl_seg_sup) {
+               seg_mask = 0;
+               seg_idx = 0;
+               seg_size = DEV_TABLE_SIZE;
+       } else {
+               seg_shift = BITS_PER_SHORT - iommu->dev_tbl_seg_sup;
+               seg_mask = ~((1 << seg_shift) - 1);
+               seg_idx = (seg_mask & bdf) >> seg_shift;
+               seg_size = DEV_TABLE_SIZE / (1 << iommu->dev_tbl_seg_sup);
+       }
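+       /*
+        * Example: with dev_tbl_seg_sup == 3 (eight segments), seg_shift is
+        * 13 and seg_mask is 0xe000, so bdf 0x4321 selects segment 2, entry
+        * 0x321 within it, and seg_size is 0x40000 bytes.
+        */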
+
+       /*
+        * Device table segmentation is tricky in Jailhouse. As cells can
+        * "share" the IOMMU, we don't know the maximum bdf in each segment
+        * because cells are initialized independently. Thus, we can't simply
+        * adjust segment sizes for our maximum bdfs.
+        *
+        * The next best thing is to lazily allocate segments as we add
+        * devices, using the maximum possible size for segments. In the
+        * worst case, we waste around a 2M chunk per IOMMU.
+        */
+       devtable_seg = iommu->devtable_segments[seg_idx];
+       if (!devtable_seg) {
+               /* If we are not permitted to allocate, just fail */
+               if (!allocate)
+                       return NULL;
+
+               devtable_seg = page_alloc(&mem_pool, PAGES(seg_size));
+               if (!devtable_seg)
+                       return NULL;
+               iommu->devtable_segments[seg_idx] = devtable_seg;
+
+               /*
+                * Initialize all entries to paging mode 0, IR & IW cleared so
+                * that DMA requests are blocked.
+                */
+               for (n = 0; n < seg_size / sizeof(struct dev_table_entry); n++)
+                       devtable_seg[n].raw64[0] =
+                               DTE_VALID | DTE_TRANSLATION_VALID;
+
+               if (!seg_idx)
+                       reg_base = AMD_DEV_TABLE_BASE_REG;
+               else
+                       reg_base = AMD_DEV_TABLE_SEG1_REG + (seg_idx - 1) * 8;
+
+               /* Size in Kbytes = (m + 1) * 4, see Sect 3.3.6 */
+               reg_val = paging_hvirt2phys(devtable_seg) |
+                       (seg_size / PAGE_SIZE - 1);
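+               /* e.g. a 0x40000-byte segment is encoded as m = 63 here */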
+               mmio_write64(iommu->mmio_base + reg_base, reg_val);
+       }
+
+       return &devtable_seg[bdf & ~seg_mask];
+}
+
 int iommu_add_pci_device(struct cell *cell, struct pci_device *device)
 {
-       /* TODO: Implement */
+       struct dev_table_entry *dte = NULL;
+       struct amd_iommu *iommu;
+       u16 bdf;
+
+       /* HACK for QEMU */
+       if (iommu_units_count == 0)
+               return 0;
+
+       if (device->info->type == JAILHOUSE_PCI_TYPE_IVSHMEM)
+               return 0;
+
+       if (device->info->iommu >= JAILHOUSE_MAX_IOMMU_UNITS)
+               return trace_error(-ERANGE);
+
+       iommu = &iommu_units[device->info->iommu];
+       bdf = device->info->bdf;
+
+       dte = get_dev_table_entry(iommu, bdf, true);
+       if (!dte)
+               return -ENOMEM;
+
+       memset(dte, 0, sizeof(*dte));
+
+       /* DomainID */
+       dte->raw64[1] = cell->id & 0xffff;
+
+       /* Translation information */
+       dte->raw64[0] = DTE_IR | DTE_IW |
+               paging_hvirt2phys(cell->arch.svm.npt_iommu_structs.root_table) |
+               DTE_PAGING_MODE_4_LEVEL | DTE_TRANSLATION_VALID | DTE_VALID;
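+       /*
+        * The root pointer above is the cell's shared NPT/IOMMU paging
+        * structure, i.e. DMA goes through the same 4-level page tables
+        * that vcpu_map_memory_region() already populates; this is why
+        * iommu_map_memory_region() has no extra work to do.
+        */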
+
+       /* TODO: Interrupt remapping. For now, just forward them unmapped. */
+
+       /* Flush caches, just to be sure. */
+       arch_paging_flush_cpu_caches(dte, sizeof(*dte));
+
+       amd_iommu_inv_dte(iommu, bdf);
+
        return 0;
 }
 
 void iommu_remove_pci_device(struct pci_device *device)
 {
-       /* TODO: Implement */
+       struct dev_table_entry *dte = NULL;
+       struct amd_iommu *iommu;
+       u16 bdf;
+
+       /* HACK for QEMU */
+       if (iommu_units_count == 0)
+               return;
+
+       if (device->info->type == JAILHOUSE_PCI_TYPE_IVSHMEM)
+               return;
+
+       iommu = &iommu_units[device->info->iommu];
+       bdf = device->info->bdf;
+
+       dte = get_dev_table_entry(iommu, bdf, false);
+       if (!dte)
+               return;
+
+       /*
+        * Set Mode to 0 (translation disabled) and clear IR and IW to block
+        * DMA requests until the entry is reprogrammed for its new owner.
+        */
+       dte->raw64[0] = DTE_VALID | DTE_TRANSLATION_VALID;
+
+       /* Flush caches, just to be sure. */
+       arch_paging_flush_cpu_caches(dte, sizeof(*dte));
+
+       amd_iommu_inv_dte(iommu, bdf);
 }
 
 void iommu_cell_exit(struct cell *cell)
 {
-       /* TODO: Implement */
+}
+
+static void wait_for_zero(volatile u64 *sem, unsigned long mask)
+{
+       while (*sem & mask)
+               cpu_relax();
+}
+
+static void amd_iommu_invalidate_pages(struct amd_iommu *iommu,
+                                      u16 domain_id)
+{
+       union buf_entry invalidate_pages = {{ 0 }};
+
+       /*
+        * Flush everything, including PDEs, in the whole address range,
+        * i.e. 0x7ffffffffffff000 with the S bit set (see Sect. 2.2.3).
+        */
+       invalidate_pages.raw32[1] = domain_id;
+       invalidate_pages.raw32[2] = 0xfffff000 | CMD_INV_IOMMU_PAGES_SIZE |
+               CMD_INV_IOMMU_PAGES_PDE;
+       invalidate_pages.raw32[3] = 0x7fffffff;
+       invalidate_pages.type = CMD_INV_IOMMU_PAGES;
+
+       amd_iommu_submit_command(iommu, &invalidate_pages, false);
+}
+
+static void amd_iommu_completion_wait(struct amd_iommu *iommu)
+{
+       union buf_entry completion_wait = {{ 0 }};
+       volatile u64 sem = 1;
+       long addr;
+
+       addr = paging_hvirt2phys(&sem);
+
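+       /*
+        * With the Store bit set and the data quadword left zero, the IOMMU
+        * writes 0 to this on-stack semaphore once all earlier commands have
+        * completed; wait_for_zero() then spins until that store lands.
+        */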
+       completion_wait.raw32[0] = (addr & BIT_MASK(31, 3)) |
+               CMD_COMPL_WAIT_STORE;
+       completion_wait.raw32[1] = (addr & BIT_MASK(51, 32)) >> 32;
+       completion_wait.type = CMD_COMPL_WAIT;
+
+       amd_iommu_submit_command(iommu, &completion_wait, true);
+       mmio_write64(iommu->mmio_base + AMD_CMD_BUF_TAIL_REG,
+                    iommu->cmd_tail_ptr);
+
+       wait_for_zero(&sem, -1);
+}
+
+static void amd_iommu_init_fault_nmi(void)
+{
+       union x86_msi_vector msi_vec = {{ 0 }};
+       union pci_msi_registers msi_reg;
+       struct per_cpu *cpu_data;
+       struct amd_iommu *iommu;
+       int n;
+
+       cpu_data = iommu_select_fault_reporting_cpu();
+
+       /* Send NMI to fault reporting CPU */
+       msi_vec.native.address = MSI_ADDRESS_VALUE;
+       msi_vec.native.destination = cpu_data->apic_id;
+
+       msi_reg.msg32.enable = 1;
+       msi_reg.msg64.address = msi_vec.raw.address;
+       msi_reg.msg64.data = MSI_DM_NMI;
+
+       for_each_iommu(iommu) {
+               struct jailhouse_iommu *cfg =
+                   &system_config->platform_info.x86.iommu_units[iommu->idx];
+
+               /* Disable MSI during interrupt reprogramming. */
+               pci_write_config(cfg->amd_bdf, cfg->amd_msi_cap + 2, 0, 2);
+
+               /*
+                * Write new MSI capability block, re-enabling interrupts with
+                * the last word.
+                */
+               for (n = 3; n >= 0; n--)
+                       pci_write_config(cfg->amd_bdf, cfg->amd_msi_cap + 4 * n,
+                                        msi_reg.raw[n], 4);
+       }
+
+       /*
+        * There is a race window between changing fault_reporting_cpu_id
+        * and actually reprogramming the MSI. To prevent event loss, signal
+        * an interrupt when done, so iommu_check_pending_faults() is called
+        * upon completion even if no further NMIs due to events would occur.
+        *
+        * Note that we can't simply use CMD_COMPL_WAIT_INT in
+        * amd_iommu_completion_wait(), as it seems the IOMMU either signals
+        * an interrupt or performs the memory write, but not both.
+        */
+       apic_send_nmi_ipi(cpu_data);
 }
 
 void iommu_config_commit(struct cell *cell_added_removed)
 {
-       /* TODO: Implement */
+       struct amd_iommu *iommu;
+
+       /* HACK for QEMU */
+       if (iommu_units_count == 0)
+               return;
+
+       /* Ensure we'll get NMI on completion, or if anything goes wrong. */
+       if (cell_added_removed)
+               amd_iommu_init_fault_nmi();
+
+       for_each_iommu(iommu) {
+               /* Flush caches */
+               if (cell_added_removed) {
+                       amd_iommu_invalidate_pages(iommu,
+                                       cell_added_removed->id & 0xffff);
+                       amd_iommu_invalidate_pages(iommu,
+                                       root_cell.id & 0xffff);
+               }
+               /* Execute all commands in the buffer */
+               amd_iommu_completion_wait(iommu);
+       }
 }
 
 struct apic_irq_message iommu_get_remapped_root_int(unsigned int iommu,
@@ -92,16 +718,125 @@ int iommu_map_interrupt(struct cell *cell, u16 device_id, unsigned int vector,
 
 void iommu_shutdown(void)
 {
-       /* TODO: Implement */
+       struct amd_iommu *iommu;
+       u64 ctrl_reg;
+
+       for_each_iommu(iommu) {
+               /* Disable the IOMMU */
+               ctrl_reg = mmio_read64(iommu->mmio_base + AMD_CONTROL_REG);
+               ctrl_reg &= ~(AMD_CONTROL_IOMMU_EN | AMD_CONTROL_CMD_BUF_EN |
+                       AMD_CONTROL_EVT_LOG_EN | AMD_CONTROL_EVT_INT_EN);
+               mmio_write64(iommu->mmio_base + AMD_CONTROL_REG, ctrl_reg);
+       }
 }
 
-void iommu_check_pending_faults(void)
+static void amd_iommu_print_event(struct amd_iommu *iommu,
+                                 union buf_entry *entry)
 {
-       /* TODO: Implement */
+       printk("AMD IOMMU %d reported event\n", iommu->idx);
+       printk(" EventCode: %x, Operand 1: %lx, Operand 2: %lx\n",
+              entry->type, entry->raw64[0], entry->raw64[1]);
+       switch (entry->type) {
+               case EVENT_TYPE_ILL_DEV_TAB_ENTRY...EVENT_TYPE_PAGE_TAB_HW_ERR:
+               case EVENT_TYPE_IOTLB_INV_TIMEOUT...EVENT_TYPE_INV_PPR_REQ:
+                       printk(" DeviceId (bus:dev.func): %02x:%02x.%x\n",
+                              PCI_BDF_PARAMS(entry->raw32[0] & 0xffff));
+                       break;
+               case EVENT_TYPE_ILL_CMD_ERR:
+               case EVENT_TYPE_CMD_HW_ERR:
+                       panic_printk("FATAL: IOMMU %d command error\n",
+                                    iommu->idx);
+                       panic_stop();
+       }
 }
 
-bool iommu_cell_emulates_ir(struct cell *cell)
+static void amd_iommu_restart_event_log(struct amd_iommu *iommu)
 {
-       /* TODO: Implement */
-       return false;
+       void *base = iommu->mmio_base;
+
+       wait_for_zero(base + AMD_STATUS_REG, AMD_STATUS_EVT_LOG_RUN);
+
+       mmio_write64_field(base + AMD_CONTROL_REG, AMD_CONTROL_EVT_LOG_EN, 0);
+
+       /* Simply start from scratch */
+       mmio_write64(base + AMD_EVT_LOG_HEAD_REG, 0);
+       mmio_write64(base + AMD_EVT_LOG_TAIL_REG, 0);
+
+       /* Clear EventOverflow (RW1C) */
+       mmio_write64_field(base + AMD_STATUS_REG, AMD_STATUS_EVT_OVERFLOW, 1);
+
+       /* Bring logging back */
+       mmio_write64_field(base + AMD_CONTROL_REG, AMD_CONTROL_EVT_LOG_EN, 1);
+}
+
+static void amd_iommu_poll_events(struct amd_iommu *iommu)
+{
+       union buf_entry *evt;
+       u32 head, tail;
+       u64 status;
+
+       status = mmio_read64(iommu->mmio_base + AMD_STATUS_REG);
+
+       if (status & AMD_STATUS_EVT_OVERFLOW) {
+               printk("IOMMU %d: Event Log overflow occurred, "
+                      "some events were lost!\n", iommu->idx);
+               amd_iommu_restart_event_log(iommu);
+       }
+
+       while (status & AMD_STATUS_EVT_LOG_INT) {
+               /* Clear EventLogInt (RW1C) */
+               mmio_write64_field(iommu->mmio_base + AMD_STATUS_REG,
+                                  AMD_STATUS_EVT_LOG_INT, 1);
+
+               head = mmio_read32(iommu->mmio_base + AMD_EVT_LOG_HEAD_REG);
+               tail = mmio_read32(iommu->mmio_base + AMD_EVT_LOG_TAIL_REG);
+
+               while (head != tail) {
+                       evt = (union buf_entry *)(iommu->evt_log_base + head);
+                       amd_iommu_print_event(iommu, evt);
+                       head = (head + sizeof(*evt)) % EVT_LOG_SIZE;
+               }
+
+               mmio_write32(iommu->mmio_base + AMD_EVT_LOG_HEAD_REG, head);
+
+               /* Re-read status to catch new events, as Linux does */
+               status = mmio_read64(iommu->mmio_base + AMD_STATUS_REG);
+       }
+}
+
+static void amd_iommu_handle_hardware_event(struct amd_iommu *iommu)
+{
+       union buf_entry hev_entry;
+       u64 hev;
+
+       hev = mmio_read64(iommu->mmio_base + AMD_HEV_STATUS_REG);
+
+       /* Check if hardware event is present and print it */
+       if (hev & AMD_HEV_VALID) {
+               if (hev & AMD_HEV_OVERFLOW)
+                       printk("IOMMU %d: Hardware Event Overflow occurred, "
+                              "some events were lost!\n", iommu->idx);
+               hev_entry.raw64[0] =
+                       mmio_read64(iommu->mmio_base + AMD_HEV_UPPER_REG);
+               hev_entry.raw64[1] =
+                       mmio_read64(iommu->mmio_base + AMD_HEV_LOWER_REG);
+
+               amd_iommu_print_event(iommu, &hev_entry);
+
+               /* Clear Hardware Event */
+               mmio_write64(iommu->mmio_base + AMD_HEV_STATUS_REG, 0);
+       }
+}
+
+void iommu_check_pending_faults(void)
+{
+       struct amd_iommu *iommu;
+
+       if (this_cpu_data()->cpu_id != fault_reporting_cpu_id)
+               return;
+
+       for_each_iommu(iommu) {
+               if (iommu->he_supported)
+                       amd_iommu_handle_hardware_event(iommu);
+               amd_iommu_poll_events(iommu);
+       }
 }