/*
* Jailhouse, a Linux-based partitioning hypervisor
*
- * Copyright (c) Siemens AG, 2013-2015
+ * Copyright (c) Siemens AG, 2013-2016
* Copyright (c) Valentine Sinitsyn, 2014
*
* Authors:
#include <asm/vcpu.h>
#include <asm/vmx.h>
-#define CR0_IDX 0
-#define CR4_IDX 1
+#define CR0_IDX 0
+#define CR4_IDX 1
+
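+/*
+ * VMX uses two contiguous 4K I/O bitmap pages: bitmap A covers ports
+ * 0x0000-0x7fff, bitmap B covers ports 0x8000-0xffff.
+ */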
+#define PIO_BITMAP_PAGES 2
static const struct segment invalid_seg = {
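+	/* access-rights bit 16 marks a segment as unusable */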
.access_rights = 0x10000
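+	/* one bit per MSR; a set bit in the write bitmap traps WRMSR */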
[ 0x828/8 ... 0x82f/8 ] = 0x81, /* 0x828, 0x82f */
[ 0x830/8 ... 0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
[ 0x838/8 ... 0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
- [ 0x840/8 ... 0x1fff/8 ] = 0,
+ [ 0x840/8 ... 0xd8f/8 ] = 0xff, /* esp. 0xc80 - 0xd8f */
+ [ 0xd90/8 ... 0x1fff/8 ] = 0,
},
[ VMX_MSR_BMP_C000_WRITE ] = {
[ 0/8 ... 0x1fff/8 ] = 0,
unsigned long vmx_pin_ctrl, vmx_basic, maybe1, required1;
unsigned long vmx_entry_ctrl, vmx_exit_ctrl;
- if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
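+	/* CPUID leaf 1, subleaf 0: ECX bit 5 advertises VMX support */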
+ if (!(cpuid_ecx(1, 0) & X86_FEATURE_VMX))
return trace_error(-ENODEV);
vmx_basic = read_msr(MSR_IA32_VMX_BASIC);
return trace_error(-EIO);
/* require RDTSCP if present in CPUID */
- if (cpuid_edx(0x80000001) & X86_FEATURE_RDTSCP) {
+ if (cpuid_edx(0x80000001, 0) & X86_FEATURE_RDTSCP) {
enable_rdtscp = SECONDARY_EXEC_RDTSCP;
if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_RDTSCP))
return trace_error(-EIO);
static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
- *pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
- EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
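+	/* bits 51:12 hold the next table's address, bits 2:0 grant R/W/X */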
+ *pte = (next_pt & BIT_MASK(51, 12)) | EPT_FLAG_READ | EPT_FLAG_WRITE |
+ EPT_FLAG_EXECUTE;
}
int vcpu_vendor_init(void)
int vcpu_vendor_cell_init(struct cell *cell)
{
- int err = -ENOMEM;
+ int err;
/* allocate io_bitmap */
- cell->arch.vmx.io_bitmap = page_alloc(&mem_pool, 2);
+ cell->arch.vmx.io_bitmap = page_alloc(&mem_pool, PIO_BITMAP_PAGES);
if (!cell->arch.vmx.io_bitmap)
- return err;
+ return -ENOMEM;
/* build root EPT of cell */
cell->arch.vmx.ept_structs.root_paging = ept_paging;
ok &= vmx_set_cell_config();
- ok &= vmcs_write32(EXCEPTION_BITMAP, 0);
+ /* see vmx_handle_exception_nmi for the interception reason */
+ ok &= vmcs_write32(EXCEPTION_BITMAP,
+ (1 << DB_VECTOR) | (1 << AC_VECTOR));
val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
val |= VM_EXIT_HOST_ADDR_SPACE_SIZE |
int err;
/* make sure all perf counters are off */
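+	/* CPUID leaf 0xa: EAX bits 7:0 report the arch perfmon version */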
- if ((cpuid_eax(0x0a) & 0xff) > 0)
+ if ((cpuid_eax(0x0a, 0) & 0xff) > 0)
write_msr(MSR_IA32_PERF_GLOBAL_CTRL, 0);
if (cpu_data->linux_cr4 & X86_CR4_VMXE)
*/
write_cr0(X86_CR0_HOST_STATE);
write_cr4(X86_CR4_HOST_STATE | X86_CR4_VMXE |
- ((cpuid_ecx(1) & X86_FEATURE_XSAVE) ? X86_CR4_OSXSAVE : 0));
+ ((cpuid_ecx(1, 0) & X86_FEATURE_XSAVE) ?
+ X86_CR4_OSXSAVE : 0));
if (!vmxon(cpu_data)) {
write_cr4(cpu_data->linux_cr4);
__builtin_unreachable();
}
-static void vmx_vcpu_reset(unsigned int sipi_vector)
+void vcpu_vendor_reset(unsigned int sipi_vector)
{
unsigned long val;
bool ok = true;
ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
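+	/* drop any event injection that vmx_handle_exception_nmi queued */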
+ ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
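+	/* a reset vCPU restarts in real mode, so clear IA-32e entry mode */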
val = vmcs_read32(VM_ENTRY_CONTROLS);
val &= ~VM_ENTRY_IA32E_MODE;
}
}
-void vcpu_nmi_handler(void)
+static void vmx_preemption_timer_set_enable(bool enable)
{
- u32 pin_based_ctrl;
-
- if (this_cpu_data()->vmx_state != VMCS_READY)
- return;
+	u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);

- pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
- pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
+ if (enable)
+ pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
+ else
+ pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}
+void vcpu_nmi_handler(void)
+{
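+	/*
+	 * Arm the preemption timer so that the next VM entry exits right
+	 * away and the pending event is handled in vmx_check_events().
+	 */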
+ if (this_cpu_data()->vmx_state == VMCS_READY)
+ vmx_preemption_timer_set_enable(true);
+}
+
void vcpu_park(void)
{
- vmx_vcpu_reset(0);
+ vcpu_vendor_reset(0);
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
}
-static void vmx_disable_preemption_timer(void)
+void vcpu_skip_emulated_instruction(unsigned int inst_len)
{
- u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
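+	/* move the guest's RIP past the instruction that was just emulated */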
+ vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
+}
- pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
- vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
+static void vmx_check_events(void)
+{
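+	/* disarm the timer that vcpu_nmi_handler may have enabled */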
+ vmx_preemption_timer_set_enable(false);
+ x86_check_events();
}
-void vcpu_skip_emulated_instruction(unsigned int inst_len)
+static void vmx_handle_exception_nmi(void)
{
- vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
+ u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
+ this_cpu_data()->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
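+		/* reflect the NMI received in guest mode into our own handler */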
+ asm volatile("int %0" : : "i" (NMI_VECTOR));
+ } else {
+ this_cpu_data()->stats[JAILHOUSE_CPU_STAT_VMEXITS_EXCEPTION]++;
+ /*
+ * Reinject the event straight away. We only intercept #DB and
+	 * #AC to prevent malicious guests from triggering infinite
+ * loops in microcode (see e.g. CVE-2015-5307 and
+ * CVE-2015-8104).
+ */
+ vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+ intr_info & INTR_TO_VECTORING_INFO_MASK);
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE));
+ }
+
+ /*
+ * Check for events even in the exception case in order to maintain
+ * control over the guest if it triggered #DB or #AC loops.
+ */
+ vmx_check_events();
}
static void update_efer(void)
if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
pg_structs->root_paging = x86_64_paging;
pg_structs->root_table_gphys =
- vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
+ vmcs_read64(GUEST_CR3) & BIT_MASK(51, 12);
} else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
!(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
pg_structs->root_paging = i386_paging;
pg_structs->root_table_gphys =
- vmcs_read64(GUEST_CR3) & 0xfffff000UL;
+ vmcs_read64(GUEST_CR3) & BIT_MASK(31, 12);
} else {
printk("FATAL: Unsupported paging mode\n");
return false;
void vcpu_handle_exit(struct per_cpu *cpu_data)
{
u32 reason = vmcs_read32(VM_EXIT_REASON);
- int sipi_vector;
cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;
switch (reason) {
case EXIT_REASON_EXCEPTION_NMI:
- asm volatile("int %0" : : "i" (NMI_VECTOR));
- /* fall through */
+ vmx_handle_exception_nmi();
+ return;
case EXIT_REASON_PREEMPTION_TIMER:
cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
- vmx_disable_preemption_timer();
- sipi_vector = x86_handle_events(cpu_data);
- if (sipi_vector >= 0) {
- printk("CPU %d received SIPI, vector %x\n",
- cpu_data->cpu_id, sipi_vector);
- vmx_vcpu_reset(sipi_vector);
- vcpu_reset(sipi_vector == APIC_BSP_PSEUDO_SIPI);
- }
- iommu_check_pending_faults();
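+		/* SIPI and IOMMU fault handling moved into x86_check_events() */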
+ vmx_check_events();
return;
case EXIT_REASON_CPUID:
vcpu_handle_cpuid();
struct vcpu_io_bitmap *iobm)
{
iobm->data = cell->arch.vmx.io_bitmap;
- iobm->size = sizeof(cell->arch.vmx.io_bitmap);
+ iobm->size = PIO_BITMAP_PAGES * PAGE_SIZE;
}
void vcpu_vendor_get_execution_state(struct vcpu_execution_state *x_state)