#
# Jailhouse, a Linux-based partitioning hypervisor
#
-# Copyright (c) Siemens AG, 2014
+# Copyright (c) Siemens AG, 2014-2016
+# Copyright (c) Valentine Sinitsyn, 2014-2015
+#
+# Authors:
+# Henning Schild <henning.schild@siemens.com>
+# Jan Kiszka <jan.kiszka@siemens.com>
+# Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
#
# This work is licensed under the terms of the GNU GPL, version 2. See
# the COPYING file in the top-level directory.
options = parser.parse_args()
-inputs = {'files': set(), 'files_opt': set(), 'files_intel': set()}
+inputs = {
+ 'files': set(),
+ 'files_opt': set(),
+ 'files_intel': set(),
+ 'files_amd': set()
+}
# required files
inputs['files'].add('/proc/iomem')
inputs['files'].add('/proc/cmdline')
inputs['files'].add('/proc/ioports')
inputs['files'].add('/sys/bus/pci/devices/*/config')
+inputs['files'].add('/sys/bus/pci/devices/*/resource')
inputs['files'].add('/sys/devices/system/cpu/cpu*/uevent')
inputs['files'].add('/sys/firmware/acpi/tables/APIC')
inputs['files'].add('/sys/firmware/acpi/tables/MCFG')
inputs['files_opt'].add('/sys/devices/jailhouse/enabled')
# platform specific files
inputs['files_intel'].add('/sys/firmware/acpi/tables/DMAR')
+inputs['files_amd'].add('/sys/firmware/acpi/tables/IVRS')
def kmg_multiply(value, kmg):
global cpuvendor
if cpuvendor == 'GenuineIntel':
set = set.union(inputs['files_intel'])
+ elif cpuvendor == 'AuthenticAMD':
+ set = set.union(inputs['files_amd'])
for file in set:
if fnmatch.fnmatch(name, file):
return dirs
+class PCIBARs:
+    """Size masks of the base address registers (BARs) of a PCI device.
+
+    The masks are computed from the start/end/flags triples found in the
+    'resource' file of the device's sysfs directory. self.mask receives one
+    32-bit mask per BAR slot; a 64-bit memory BAR fills two consecutive
+    slots, lower half first.
+    """
+
+    IORESOURCE_IO = 0x00000100
+    IORESOURCE_MEM = 0x00000200
+    IORESOURCE_MEM_64 = 0x00100000
+
+    def __init__(self, dir):
+        """Read <dir>/resource (via input_open) and build self.mask."""
+        self.mask = []
+        f = input_open(os.path.join(dir, 'resource'), 'r')
+        try:
+            n = 0
+            # A plain 'for n in range(6)' cannot skip a slot: rebinding the
+            # loop variable has no effect on the next iteration.
+            while n < 6:
+                (start, end, flags) = f.readline().split()
+                n += 1
+                flags = int(flags, 16)
+                if flags & PCIBARs.IORESOURCE_IO:
+                    mask = ~(int(end, 16) - int(start, 16))
+                elif flags & PCIBARs.IORESOURCE_MEM:
+                    mask = ~(int(end, 16) - int(start, 16))
+                    if flags & PCIBARs.IORESOURCE_MEM_64 and n < 6:
+                        self.mask.append(mask & 0xffffffff)
+                        mask >>= 32
+                        # The upper half occupies the next BAR slot, so the
+                        # line of that slot must not be parsed as a BAR of
+                        # its own.
+                        # NOTE(review): assumes the resource file carries a
+                        # line for that slot as well - confirm.
+                        f.readline()
+                        n += 1
+                else:
+                    mask = 0
+                self.mask.append(mask & 0xffffffff)
+        finally:
+            f.close()
+
+
class PCICapability:
def __init__(self, id, start, len, flags, content, msix_address):
self.id = id
RD = '0'
RW = 'JAILHOUSE_PCICAPS_WRITE'
+ JAILHOUSE_PCI_EXT_CAP = 0x8000
+
@staticmethod
def parse_pcicaps(dir):
caps = []
+ has_extended_caps = False
f = input_open(os.path.join(dir, 'config'), 'rb')
f.seek(0x06)
(status,) = struct.unpack('<H', f.read(2))
if (msgctl & (1 << 8)) != 0: # per-vector masking support
len += 10
flags = PCICapability.RW
+ elif id == 0x10: # Express
+ len = 20
+ (cap_reg,) = struct.unpack('<H', f.read(2))
+ if (cap_reg & 0xf) >= 2: # v2 capability
+ len = 44
+ # access side effects still need to be analyzed
+ flags = PCICapability.RD
+ has_extended_caps = True
elif id == 0x11: # MSI-X
# access will be moderated by hypervisor
len = 12
content = f.read(len - 2)
caps.append(PCICapability(id, cap, len, flags, content,
msix_address))
+
+ if has_extended_caps:
+ # walk extended capability list
+ next = 0x100
+ while next != 0:
+ cap = next
+ f.seek(cap)
+ (id, version_next) = struct.unpack('<HH', f.read(4))
+ next = version_next >> 4
+ if id == 0xffff:
+ break
+ elif id == 0x0010: # SR-IOV
+ len = 64
+ # access side effects still need to be analyzed
+ flags = PCICapability.RD
+ else:
+ if (id & PCICapability.JAILHOUSE_PCI_EXT_CAP) != 0:
+ print('WARNING: Ignoring unsupported PCI Express '
+ 'Extended Capability ID %x' % id)
+ continue
+ # unknown/unhandled cap, mark its existence
+ len = 4
+ flags = PCICapability.RD
+ f.seek(cap + 4)
+ content = f.read(len - 4)
+ id |= PCICapability.JAILHOUSE_PCI_EXT_CAP
+ caps.append(PCICapability(id, cap, len, flags, content, 0))
+
+ f.close()
return caps
class PCIDevice:
- def __init__(self, type, domain, bus, dev, fn, caps):
+ def __init__(self, type, domain, bus, dev, fn, bars, caps, path):
self.type = type
self.iommu = None
self.domain = domain
self.bus = bus
self.dev = dev
self.fn = fn
+ self.bars = bars
self.caps = caps
+ self.path = path
self.caps_start = 0
self.num_caps = len(caps)
self.num_msi_vectors = 0
self.num_msi_vectors = 1 << ((msg_ctrl >> 1) & 0x7)
self.msi_64bits = (msg_ctrl >> 7) & 1
else: # MSI-X
- vectors = (msg_ctrl & 0x7ff) + 1
- self.num_msix_vectors = vectors
- self.msix_region_size = (vectors * 16 + 0xfff) & 0xf000
- self.msix_address = c.msix_address
+ if c.msix_address != 0:
+ vectors = (msg_ctrl & 0x7ff) + 1
+ self.num_msix_vectors = vectors
+ self.msix_region_size = (vectors * 16 + 0xfff) & 0xf000
+ self.msix_address = c.msix_address
+ else:
+ print('WARNING: Ignoring invalid MSI-X configuration'
+ ' of device %02x:%02x.%x' % (bus, dev, fn))
def __str__(self):
return 'PCIDevice: %02x:%02x.%x' % (self.bus, self.dev, self.fn)
def parse_pcidevice_sysfsdir(basedir, dir):
dpath = os.path.join(basedir, dir)
f = input_open(os.path.join(dpath, 'config'), 'rb')
+ (vendor_device,) = struct.unpack('<I', f.read(4))
+ if vendor_device == 0xffffffff:
+ print('WARNING: Ignoring apparently disabled PCI device %s' % dir)
+ return None
f.seek(0x0A)
(classcode,) = struct.unpack('<H', f.read(2))
f.close()
domain = int(a[0], 16)
bus = int(a[1], 16)
df = a[2].split('.')
+ bars = PCIBARs(dpath)
caps = PCICapability.parse_pcicaps(dpath)
return PCIDevice(type, domain, bus, int(df[0], 16), int(df[1], 16),
- caps)
+ bars, caps, dpath)
+
+
+class PCIPCIBridge(PCIDevice):
+    """Helpers that only apply to PCI-to-PCI bridge devices."""
+
+    @staticmethod
+    def get_2nd_busses(dev):
+        """Return the (secondary, subordinate) bus numbers of a bridge.
+
+        dev must be a device of type 'JAILHOUSE_PCI_TYPE_BRIDGE'. Both
+        numbers are read as single bytes from offsets 0x19 and 0x1a of the
+        'config' file below dev.path.
+        """
+        assert dev.type == 'JAILHOUSE_PCI_TYPE_BRIDGE'
+        f = input_open(os.path.join(dev.path, 'config'), 'rb')
+        try:
+            f.seek(0x19)
+            (secondbus, subordinate) = struct.unpack('<BB', f.read(2))
+        finally:
+            # release the handle even if the config space is truncated
+            f.close()
+        return (secondbus, subordinate)
class MemRegion:
self.start = start
self.stop = stop
self.typestr = typestr
- if comments is None:
- self.comments = []
- else:
- self.comments = comments
+ self.comments = comments or []
def __str__(self):
return 'MemRegion: %08x-%08x : %s' % \
self.typestr == 'System RAM' or
self.typestr == 'Kernel' or
self.typestr == 'RAM buffer' or
- self.typestr == 'ACPI DMAR RMRR'
+ self.typestr == 'ACPI DMAR RMRR' or
+ self.typestr == 'ACPI IVRS'
):
s = 'JAILHOUSE_MEM_READ | JAILHOUSE_MEM_WRITE |\n'
s += p + '\t\tJAILHOUSE_MEM_EXECUTE | JAILHOUSE_MEM_DMA'
# blacklisted on all levels
if (
(s.find('PCI MMCONFIG') >= 0) or
- (s.find('APIC') >= 0) or # covers both APIC and IOAPIC
- (s.find('dmar') >= 0)
+ (s.find('APIC') >= 0) # covers both APIC and IOAPIC
):
continue
return regions
+class IOMMUConfig(object):
+    """Description of a single IOMMU unit.
+
+    props is a dictionary that must provide 'base_addr' and 'mmio_size'.
+    If it also contains 'amd_bdf', the keys 'amd_base_cap', 'amd_msi_cap'
+    and 'amd_features' are required as well; all four are then copied to
+    attributes of the same name.
+    """
+
+    def __init__(self, props):
+        self.base_addr = props['base_addr']
+        self.mmio_size = props['mmio_size']
+        if 'amd_bdf' not in props:
+            return
+        # AMD-specific attributes only exist on AMD units
+        for key in ('amd_bdf', 'amd_base_cap', 'amd_msi_cap',
+                    'amd_features'):
+            setattr(self, key, props[key])
+
+    @property
+    def is_amd_iommu(self):
+        """True if the AMD-specific attributes were set on this unit."""
+        try:
+            self.amd_bdf
+        except AttributeError:
+            return False
+        return True
+
+
def parse_iomem(pcidevices):
regions = IOMemRegionTree.parse_iomem_tree(
IOMemRegionTree.parse_iomem_file())
- # filter the list for MSI-X pages
+ rom_region = MemRegion(0xc0000, 0xdffff, 'ROMs')
+ add_rom_region = False
+
ret = []
+ dmar_regions = []
for r in regions:
+ append_r = True
+ # filter the list for MSI-X pages
for d in pcidevices:
if d.msix_address >= r.start and d.msix_address <= r.stop:
if d.msix_address > r.start:
tail_r = MemRegion(d.msix_address + d.msix_region_size,
r.stop, r.typestr, r.comments)
ret.append(tail_r)
- r = None
+ append_r = False
break
- if r:
+ # filter out the ROMs
+ if (r.start >= rom_region.start and r.stop <= rom_region.stop):
+ add_rom_region = True
+ append_r = False
+ # filter out and save DMAR regions
+ if r.typestr.find('dmar') >= 0:
+ dmar_regions.append(r)
+ append_r = False
+ if append_r:
ret.append(r)
+ # add a region that covers all potential ROMs
+ if add_rom_region:
+ ret.append(rom_region)
+
# newer Linux kernels will report the first page as reserved
# it is needed for CPU init so include it anyways
if (ret[0].typestr == 'System RAM' and ret[0].start == 0x1000):
ret[0].start = 0
- return ret
+ return ret, dmar_regions
def parse_pcidevices():
f.seek(struct_len - offset, os.SEEK_CUR)
+ f.close()
return ioapics
# parsing of DMAR ACPI Table
# see Intel VT-d Spec chapter 8
-def parse_dmar(pcidevices, ioapics):
+def parse_dmar(pcidevices, ioapics, dmar_regions):
f = input_open('/sys/firmware/acpi/tables/DMAR', 'rb')
signature = f.read(4)
if signature != b'DMAR':
raise RuntimeError('We do not support multiple PCI segments')
if len(units) >= 8:
raise RuntimeError('Too many DMAR units. '
- 'Raise JAILHOUSE_MAX_DMAR_UNITS.')
- units.append(base)
+ 'Raise JAILHOUSE_MAX_IOMMU_UNITS.')
+ size = 0
+ for r in dmar_regions:
+ if base == r.start:
+ size = r.size()
+ if size == 0:
+ raise RuntimeError('DMAR region size cannot be identified.\n'
+ 'Target Linux must run with Intel IOMMU '
+ 'enabled.')
+ if size > 0x3000:
+ raise RuntimeError('Unexpectedly large DMAR region.')
+ units.append(IOMMUConfig({
+ 'base_addr': base,
+ 'mmio_size': size
+ }))
if flags & 1:
for d in pcidevices:
if d.iommu is None:
parse_dmar_devscope(f)
# PCI Endpoint Device
if scope_type == 1:
+ assert not (flags & 1)
for d in pcidevices:
if d.bus == bus and d.dev == dev and d.fn == fn:
d.iommu = len(units) - 1
break
# PCI Sub-hierarchy
elif scope_type == 2:
- raise RuntimeError('Unsupported DMAR Device Scope type')
+ assert not (flags & 1)
+ for d in pcidevices:
+ if d.bus == bus and d.dev == dev and d.fn == fn:
+ (secondbus, subordinate) = \
+ PCIPCIBridge.get_2nd_busses(d)
+ for d2 in pcidevices:
+ if (
+ d2.bus >= secondbus and
+ d2.bus <= subordinate
+ ):
+ d2.iommu = len(units) - 1
+ break
# IOAPIC
elif scope_type == 3:
ioapic = next(chip for chip in ioapics if chip.id == id)
if chip.bdf == bdf:
raise RuntimeError('IOAPICs with identical BDF')
ioapic.bdf = bdf
- ioapic.dmar_unit = len(units) - 1
+ ioapic.iommu = len(units) - 1
offset += scope_len
# Reserved Memory Region Reporting Structure
f.seek(struct_len - offset, os.SEEK_CUR)
+ f.close()
+
+ for d in pcidevices:
+ if d.iommu is None:
+ raise RuntimeError(
+ 'PCI device %02x:%02x.%x outside the scope of an '
+ 'IOMMU' % (d.bus, d.dev, d.fn))
+
return units, regions
+def parse_ivrs(pcidevices, ioapics):
+    """Parse the ACPI IVRS table that describes the AMD IOMMU.
+
+    The index of the IOMMU unit is stored in the 'iommu' attribute of every
+    PCI device and IOAPIC selected by the IVHD block, the PCI function of
+    the IOMMU itself is removed from pcidevices, and the memory blocks
+    reported by IVMD blocks are collected.
+
+    Returns a tuple (units, regions): a list of IOMMUConfig objects and a
+    list of MemRegion objects of type 'ACPI IVRS'.
+
+    Raises RuntimeError if the table is malformed, relies on an unsupported
+    feature, or leaves a PCI device outside the scope of the IOMMU.
+    """
+    f = input_open('/sys/firmware/acpi/tables/IVRS', 'rb')
+    try:
+        return _parse_ivrs_table(f, pcidevices, ioapics)
+    finally:
+        # close the table on the error paths as well
+        f.close()
+
+
+def _parse_ivrs_table(f, pcidevices, ioapics):
+    """Parse the IVRS table from the open binary file f (see parse_ivrs)."""
+    def format_bdf(bdf):
+        bus, dev, fun = (bdf >> 8) & 0xff, (bdf >> 3) & 0x1f, bdf & 0x7
+        return '%02x:%02x.%x' % (bus, dev, fun)
+
+    signature = f.read(4)
+    if signature != b'IVRS':
+        raise RuntimeError('IVRS: incorrect input file format %s' % signature)
+
+    (length, revision) = struct.unpack('<IB', f.read(5))
+    if revision > 2:
+        raise RuntimeError('IVRS: unsupported Revision %02x' % revision)
+
+    # the IVHD/IVMD blocks start behind the 48 byte table header
+    f.seek(48, os.SEEK_SET)
+    length -= 48
+
+    units = []
+    regions = []
+    # BDF of devices that are permitted outside IOMMU: root complex
+    iommu_skiplist = set([0x0])
+    ivhd_blocks = 0
+    while length > 0:
+        # common block header: type, flags, length of the whole block
+        (block_type, block_flags, block_length) = struct.unpack(
+            '<BBH', f.read(4))
+        if block_length < 4:
+            # would never advance through the table
+            raise RuntimeError(
+                'IVRS: invalid block length %d' % block_length)
+        length -= block_length
+
+        if block_type in [0x10, 0x11]:
+            ivhd_blocks += 1
+            if ivhd_blocks > 1:
+                raise RuntimeError('Jailhouse doesn\'t support more than one '
+                                   'AMD IOMMU per PCI function.')
+            # IVHD block
+            ivhd_fields = struct.unpack('<HHQHxxL', f.read(20))
+            (iommu_bdf, base_cap_ofs,
+             base_addr, pci_seg, iommu_feat) = ivhd_fields
+
+            header_length = 24
+            if block_type == 0x11:
+                # Type 0x11 carries 16 more header bytes (EFR image and a
+                # reserved field) in front of the device entries.
+                f.seek(16, os.SEEK_CUR)
+                header_length = 40
+
+            if pci_seg != 0:
+                raise RuntimeError('We do not support multiple PCI segments')
+
+            # same limit check as for the DMAR units
+            if len(units) >= 8:
+                raise RuntimeError('Too many IOMMU units. '
+                                   'Raise JAILHOUSE_MAX_IOMMU_UNITS.')
+
+            msi_cap_ofs = None
+
+            for i, d in enumerate(pcidevices):
+                if d.bdf() == iommu_bdf:
+                    # Extract MSI capability offset
+                    for c in d.caps:
+                        if c.id == 0x05:
+                            msi_cap_ofs = c.start
+                    # We must not map IOMMU to the cells
+                    del pcidevices[i]
+                    # the list changed, do not continue iterating over it
+                    break
+
+            if msi_cap_ofs is None:
+                raise RuntimeError('AMD IOMMU lacks MSI support, and '
+                                   'Jailhouse doesn\'t support MSI-X yet.')
+
+            if (iommu_feat & (0xF << 13)) and (iommu_feat & (0x3F << 17)):
+                # Performance Counters are supported, allocate 512K
+                mmio_size = 524288
+            else:
+                # Allocate 16K
+                mmio_size = 16384
+
+            units.append(IOMMUConfig({
+                'base_addr': base_addr,
+                'mmio_size': mmio_size,
+                'amd_bdf': iommu_bdf,
+                'amd_base_cap': base_cap_ofs,
+                'amd_msi_cap': msi_cap_ofs,
+                # IVHD block type 0x11 has exact EFR copy but type 0x10 may
+                # overwrite what hardware reports. Set reserved bit 0 in that
+                # case to indicate that the value is in use.
+                'amd_features': (iommu_feat | 0x1) if block_type == 0x10 else 0
+            }))
+
+            _parse_ivhd_entries(f, block_length - header_length,
+                                len(units) - 1, pcidevices, ioapics)
+
+        elif block_type in [0x20, 0x21, 0x22]:
+            # IVMD block; type, flags and length were consumed with the
+            # common header, 28 bytes of fixed fields remain
+            ivmd_fields = struct.unpack('<HH8xQQ', f.read(28))
+            (device_id, aux_data, mem_addr, mem_len) = ivmd_fields
+            if block_length > 32:
+                # skip whatever follows the fields we understand
+                f.seek(block_length - 32, os.SEEK_CUR)
+
+            if int(block_flags):
+                bdf_str = format_bdf(device_id)
+                print(
+                    'WARNING: Jailhouse doesn\'t support configurable '
+                    '(eg. read-only) device memory. Device %s may not '
+                    'work properly, especially in non-root cell.' % bdf_str)
+
+            if block_type == 0x20:
+                # All devices
+                comment = None
+            elif block_type == 0x21:
+                # Selected device
+                comment = 'PCI Device: %s' % format_bdf(device_id)
+            elif block_type == 0x22:
+                # Device range
+                comment = 'PCI Device: %s - %s' % (
+                    format_bdf(device_id), format_bdf(aux_data))
+
+            if comment:
+                print('WARNING: Jailhouse doesn\'t support per-device memory '
+                      'regions. The memory at 0x%x will be mapped accessible '
+                      'to all devices.' % mem_addr)
+
+            # MemRegion takes an inclusive stop address and a list of
+            # comments, not a length and a bare string.
+            regions.append(MemRegion(mem_addr, mem_addr + mem_len - 1,
+                                     'ACPI IVRS',
+                                     [comment] if comment else None))
+        elif block_type == 0x40:
+            raise RuntimeError(
+                'You board uses IVRS Rev. 2 feature Jailhouse doesn\'t '
+                'support yet. Please report this to '
+                'jailhouse-dev@googlegroups.com.')
+        else:
+            print(
+                'WARNING: Skipping unknown IVRS '
+                'block type 0x%02x' % block_type)
+            # step over the block body so the next header is read in sync
+            f.seek(block_length - 4, os.SEEK_CUR)
+
+    for d in pcidevices:
+        if d.bdf() not in iommu_skiplist and d.iommu is None:
+            raise RuntimeError(
+                'PCI device %02x:%02x.%x outside the scope of an '
+                'IOMMU' % (d.bus, d.dev, d.fn))
+
+    return units, regions
+
+
+def _parse_ivhd_entries(f, block_length, unit, pcidevices, ioapics):
+    """Walk the device entries of one IVHD block.
+
+    f is positioned at the first entry and block_length is the number of
+    bytes occupied by the entries. unit is stored in the 'iommu' attribute
+    of every PCI device or IOAPIC selected by an entry.
+    """
+    bdf_start_range = None
+    while block_length > 0:
+        (entry_type, device_id) = struct.unpack('<BHx', f.read(4))
+        block_length -= 4
+
+        if entry_type == 0x01:
+            # All
+            for d in pcidevices:
+                d.iommu = unit
+        elif entry_type == 0x02:
+            # Select
+            for d in pcidevices:
+                if d.bdf() == device_id:
+                    d.iommu = unit
+        elif entry_type == 0x03:
+            # Start of range
+            bdf_start_range = device_id
+        elif entry_type == 0x04:
+            # End of range
+            if bdf_start_range is None:
+                continue
+            for d in pcidevices:
+                if bdf_start_range <= d.bdf() <= device_id:
+                    d.iommu = unit
+            bdf_start_range = None
+        elif entry_type == 0x42:
+            # Alias select
+            (device_id_b,) = struct.unpack('<xHx', f.read(4))
+            block_length -= 4
+            for d in pcidevices:
+                if d.bdf() == device_id_b:
+                    d.iommu = unit
+        elif entry_type == 0x43:
+            # Alias start of range
+            (device_id_b,) = struct.unpack('<xHx', f.read(4))
+            block_length -= 4
+            bdf_start_range = device_id_b
+        elif entry_type == 0x48:
+            # Special device
+            (handle, device_id_b, variety) = struct.unpack(
+                '<BHB', f.read(4))
+            block_length -= 4
+            if variety == 0x01:  # IOAPIC
+                for chip in ioapics:
+                    if chip.id == handle:
+                        # The BDF of the special device is the source
+                        # device ID in the second half of the entry; the
+                        # DeviceID field of the first half is reserved.
+                        chip.bdf = device_id_b
+                        chip.iommu = unit
+        else:
+            # Reserved or ignored entries
+            if entry_type >= 0x40:
+                # these entries are 8 bytes long, skip the second half
+                f.seek(4, os.SEEK_CUR)
+                block_length -= 4
+
+
def parse_ioports():
pm_timer_base = None
f = input_open('/proc/ioports')
filelist = ' '.join(inputs['files'])
filelist_opt = ' '.join(inputs['files_opt'])
filelist_intel = ' '.join(inputs['files_intel'])
+ filelist_amd = ' '.join(inputs['files_amd'])
tmpl = Template(filename=os.path.join(options.template_dir,
'jailhouse-config-collect.tmpl'))
f.write(tmpl.render(filelist=filelist, filelist_opt=filelist_opt,
- filelist_intel=filelist_intel))
+ filelist_intel=filelist_intel, filelist_amd=filelist_amd))
f.close()
sys.exit(0)
inmatemem = kmg_multiply_str(options.mem_inmates)
hvmem = [0, kmg_multiply_str(options.mem_hv)]
-regions = parse_iomem(pcidevices)
+(regions, dmar_regions) = parse_iomem(pcidevices)
ourmem = parse_kernel_cmdline()
total = hvmem[1] + inmatemem
ioapics = parse_madt()
-if get_cpu_vendor() == 'GenuineIntel':
- (dmar_units, rmrr_regs) = parse_dmar(pcidevices, ioapics)
+vendor = get_cpu_vendor()
+if vendor == 'GenuineIntel':
+ (iommu_units, extra_memregs) = parse_dmar(pcidevices, ioapics,
+ dmar_regions)
else:
- (dmar_units, rmrr_regs) = [], []
-regions += rmrr_regs
-
-for d in pcidevices:
- if get_cpu_vendor() == 'AuthenticAMD':
- d.iommu = 0 # temporary workaround
- if d.iommu is None:
- raise RuntimeError('PCI device %02x:%02x.%x outside the scope of an '
- 'IOMMU' % (d.bus, d.dev, d.fn))
+ (iommu_units, extra_memregs) = parse_ivrs(pcidevices, ioapics)
+regions += extra_memregs
# kernel does not have memmap region, pick one
if ourmem is None:
ourmem = alloc_mem(regions, total)
elif (total > ourmem[1]):
raise RuntimeError('Your memmap reservation is too small you need >="' +
- hex(total) + '"')
+ hex(total) + '". Hint: your kernel cmd line needs '
+ '"memmap=' + hex(total) + '$' + hex(ourmem[0]) + '"')
hvmem[0] = ourmem[0]
f = open(options.file, 'w')
tmpl = Template(filename=os.path.join(options.template_dir,
'root-cell-config.c.tmpl'))
-f.write(tmpl.render(regions=regions,
- ourmem=ourmem,
- argstr=' '.join(sys.argv),
- hvmem=hvmem,
- product=product,
- pcidevices=pcidevices,
- pcicaps=pcicaps,
- cpucount=cpucount,
- irqchips=ioapics,
- pm_timer_base=pm_timer_base,
- mmconfig=mmconfig,
- dmar_units=dmar_units))
+kwargs = {
+ 'regions': regions,
+ 'ourmem': ourmem,
+ 'argstr': ' '.join(sys.argv),
+ 'hvmem': hvmem,
+ 'product': product,
+ 'pcidevices': pcidevices,
+ 'pcicaps': pcicaps,
+ 'cpucount': cpucount,
+ 'irqchips': ioapics,
+ 'pm_timer_base': pm_timer_base,
+ 'mmconfig': mmconfig,
+ 'iommu_units': iommu_units
+}
+
+f.write(tmpl.render(**kwargs))
f.close()