/*
 * Jailhouse, a Linux-based partitioning hypervisor
 *
 * Copyright (c) Siemens AG, 2013
 *
 * Authors:
 *  Jan Kiszka <jan.kiszka@siemens.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <jailhouse/entry.h>
#include <jailhouse/paging.h>
#include <jailhouse/processor.h>
#include <jailhouse/printk.h>
#include <jailhouse/string.h>
#include <jailhouse/control.h>
#include <jailhouse/hypercall.h>
#include <jailhouse/mmio.h>
#include <jailhouse/pci.h>
#include <asm/apic.h>
#include <asm/control.h>
#include <asm/io.h>
#include <asm/ioapic.h>
#include <asm/pci.h>
#include <asm/vmx.h>
#include <asm/vtd.h>

static const struct segment invalid_seg = {
        .access_rights = 0x10000
};

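/*
 * VMX MSR bitmap: one bit per MSR, a set bit forces a VM exit on the
 * corresponding RDMSR/WRMSR. The four 1K blocks of this single 4K page
 * cover reads and writes for the MSR ranges 0x00000000-0x00001fff and
 * 0xc0000000-0xc0001fff. The preset values trap most of the x2APIC MSR
 * range (0x800-0x83f); the per-entry comments list the MSRs selected
 * by each bit mask.
 */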
static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
        [ VMX_MSR_BMP_0000_READ ] = {
                [      0/8 ...  0x7ff/8 ] = 0,
                [  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
                [  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
                [  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
                [  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
                [  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
                [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
                [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
                [  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
                [  0x840/8 ... 0x1fff/8 ] = 0,
        },
        [ VMX_MSR_BMP_C000_READ ] = {
                [      0/8 ... 0x1fff/8 ] = 0,
        },
        [ VMX_MSR_BMP_0000_WRITE ] = {
                [      0/8 ...  0x807/8 ] = 0,
                [  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
                [  0x810/8 ...  0x827/8 ] = 0,
                [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
                [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
                [  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
                [  0x840/8 ... 0x1fff/8 ] = 0,
        },
        [ VMX_MSR_BMP_C000_WRITE ] = {
                [      0/8 ... 0x1fff/8 ] = 0,
        },
};

static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];
static u32 enable_rdtscp;

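/*
 * All VMX instructions signal failure through RFLAGS: CF=1 means
 * VMfailInvalid, ZF=1 means VMfailValid (error code available in the
 * VMCS field VM_INSTRUCTION_ERROR). The "seta" in the helpers below
 * therefore yields 1 only if both flags are clear, i.e. on VMsucceed.
 */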
static bool vmxon(struct per_cpu *cpu_data)
{
        unsigned long vmxon_addr;
        u8 ok;

        vmxon_addr = page_map_hvirt2phys(&cpu_data->vmxon_region);
        asm volatile(
                "vmxon (%1)\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (&vmxon_addr), "m" (vmxon_addr)
                : "memory", "cc");
        return ok;
}

static bool vmcs_clear(struct per_cpu *cpu_data)
{
        unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
        u8 ok;

        asm volatile(
                "vmclear (%1)\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (&vmcs_addr), "m" (vmcs_addr)
                : "memory", "cc");
        return ok;
}

static bool vmcs_load(struct per_cpu *cpu_data)
{
        unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
        u8 ok;

        asm volatile(
                "vmptrld (%1)\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (&vmcs_addr), "m" (vmcs_addr)
                : "memory", "cc");
        return ok;
}

static inline unsigned long vmcs_read64(unsigned long field)
{
        unsigned long value;

        asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
        return value;
}

static inline u16 vmcs_read16(unsigned long field)
{
        return vmcs_read64(field);
}

static inline u32 vmcs_read32(unsigned long field)
{
        return vmcs_read64(field);
}

static bool vmcs_write64(unsigned long field, unsigned long val)
{
        u8 ok;

        asm volatile(
                "vmwrite %1,%2\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (val), "r" (field)
                : "cc");
        if (!ok)
                printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
                       field, vmcs_read32(VM_INSTRUCTION_ERROR),
                       __builtin_return_address(0));
        return ok;
}

static bool vmcs_write16(unsigned long field, u16 value)
{
        return vmcs_write64(field, value);
}

static bool vmcs_write32(unsigned long field, u32 value)
{
        return vmcs_write64(field, value);
}

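/*
 * The VMX capability MSRs read below encode allowed control settings:
 * the low 32 bits hold the allowed-0 (must-be-one) bits, the high 32
 * bits the allowed-1 bits. Shifting by 32 thus extracts the features
 * the CPU can enable at all; a control is usable only if its allowed-1
 * bit is set.
 */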
static int vmx_check_features(void)
{
        unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
        unsigned long vmx_pin_ctrl, vmx_basic;

        if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
                return -ENODEV;

        vmx_basic = read_msr(MSR_IA32_VMX_BASIC);

        /* require VMCS size <= PAGE_SIZE,
         * VMCS memory access type == write back and
         * availability of TRUE_*_CTLS */
        if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
            ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
            !(vmx_basic & (1UL << 55)))
                return -EIO;

        /* require NMI exiting and preemption timer support */
        vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
        if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
            !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
                return -EIO;

        /* require I/O and MSR bitmap as well as secondary controls support */
        vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
        if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
            !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
            !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                return -EIO;

        /* require disabling of CR3 access interception */
        vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
        if (vmx_proc_ctrl &
            (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
                return -EIO;

        /* require APIC access, EPT and unrestricted guest mode support */
        vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
        ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
        if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
            !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
            (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
            !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
            !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
                return -EIO;

        /* require RDTSCP interception support if RDTSCP is present in CPUID */
        if (cpuid_edx(0x80000001) & X86_FEATURE_RDTSCP) {
                enable_rdtscp = SECONDARY_EXEC_RDTSCP;
                if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_RDTSCP))
                        return -EIO;
        }

        /* require activity state HLT */
        if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
                return -EIO;

        return 0;
}

static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
        *pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
                EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
}

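/*
 * EPT entries use a format close to ordinary x86-64 page tables, which
 * is why ept_paging below can be copied from x86_64_paging and only
 * needs its set_next_pt hook replaced: EPT has no present bit, but
 * separate read/write/execute permission bits instead. Huge page
 * support at the individual levels is optional and probed via the
 * EPT/VPID capability MSR.
 */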
int vmx_init(void)
{
        unsigned int n;
        int err;

        err = vmx_check_features();
        if (err)
                return err;

        /* derive ept_paging from very similar x86_64_paging */
        memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
        for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
                ept_paging[n].set_next_pt = ept_set_next_pt;
        if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
                ept_paging[1].page_size = 0;
        if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
                ept_paging[2].page_size = 0;

        /* allow direct x2APIC access except for ICR writes */
        memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
               0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
        memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
               0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
        msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;

        return vmx_cell_init(&root_cell);
}

unsigned long arch_page_map_gphys2phys(struct per_cpu *cpu_data,
                                       unsigned long gphys)
{
        return page_map_virt2phys(&cpu_data->cell->vmx.ept_structs, gphys);
}

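/*
 * The two-page VMX I/O bitmap covers ports 0-0xffff, one bit per port;
 * a set bit triggers a VM exit. The cell configuration provides its
 * PIO bitmap in the same encoding, so vmx_cell_init first sets every
 * bit (trap everything) and then copies in the configured bitmap,
 * which may be shorter than the full two pages. Creating a non-root
 * cell also revokes the new cell's ports from the root cell by OR-ing
 * the inverted cell bitmap into the root cell's bitmap.
 */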
int vmx_cell_init(struct cell *cell)
{
        const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
        u32 pio_bitmap_size = cell->config->pio_bitmap_size;
        unsigned int n, pm_timer_addr;
        u32 size;
        int err;
        u8 *b;

        /* PM timer has to be provided */
        if (system_config->platform_info.x86.pm_timer_address == 0)
                return -EINVAL;

        /* build root EPT of cell */
        cell->vmx.ept_structs.root_paging = ept_paging;
        cell->vmx.ept_structs.root_table = page_alloc(&mem_pool, 1);
        if (!cell->vmx.ept_structs.root_table)
                return -ENOMEM;

        err = page_map_create(&cell->vmx.ept_structs,
                              page_map_hvirt2phys(apic_access_page),
                              PAGE_SIZE, XAPIC_BASE,
                              EPT_FLAG_READ|EPT_FLAG_WRITE|EPT_FLAG_WB_TYPE,
                              PAGE_MAP_NON_COHERENT);
        if (err)
                return err;

        memset(cell->vmx.io_bitmap, -1, sizeof(cell->vmx.io_bitmap));

        for (n = 0; n < 2; n++) {
                size = pio_bitmap_size <= PAGE_SIZE ?
                        pio_bitmap_size : PAGE_SIZE;
                memcpy(cell->vmx.io_bitmap + n * PAGE_SIZE, pio_bitmap, size);
                pio_bitmap += size;
                pio_bitmap_size -= size;
        }

        if (cell != &root_cell) {
                /*
                 * Shrink PIO access of root cell corresponding to the new
                 * cell's access rights.
                 */
                pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
                pio_bitmap_size = cell->config->pio_bitmap_size;
                for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
                     b++, pio_bitmap++, pio_bitmap_size--)
                        *b |= ~*pio_bitmap;
        }

        /* permit access to the PM timer */
        pm_timer_addr = system_config->platform_info.x86.pm_timer_address;
        for (n = 0; n < 4; n++, pm_timer_addr++) {
                b = cell->vmx.io_bitmap;
                b[pm_timer_addr / 8] &= ~(1 << (pm_timer_addr % 8));
        }

        return 0;
}

int vmx_map_memory_region(struct cell *cell,
                          const struct jailhouse_memory *mem)
{
        u64 phys_start = mem->phys_start;
        u32 flags = EPT_FLAG_WB_TYPE;

        if (mem->flags & JAILHOUSE_MEM_READ)
                flags |= EPT_FLAG_READ;
        if (mem->flags & JAILHOUSE_MEM_WRITE)
                flags |= EPT_FLAG_WRITE;
        if (mem->flags & JAILHOUSE_MEM_EXECUTE)
                flags |= EPT_FLAG_EXECUTE;
        if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
                phys_start = page_map_hvirt2phys(&cell->comm_page);

        return page_map_create(&cell->vmx.ept_structs, phys_start, mem->size,
                               mem->virt_start, flags, PAGE_MAP_NON_COHERENT);
}

int vmx_unmap_memory_region(struct cell *cell,
                            const struct jailhouse_memory *mem)
{
        return page_map_destroy(&cell->vmx.ept_structs, mem->virt_start,
                                mem->size, PAGE_MAP_NON_COHERENT);
}

void vmx_cell_exit(struct cell *cell)
{
        const u8 *root_pio_bitmap =
                jailhouse_cell_pio_bitmap(root_cell.config);
        const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
        u32 pio_bitmap_size = cell->config->pio_bitmap_size;
        u8 *b;

        page_map_destroy(&cell->vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
                         PAGE_MAP_NON_COHERENT);

        /* hand the removed cell's PIO access rights back to the root cell */
        if (root_cell.config->pio_bitmap_size < pio_bitmap_size)
                pio_bitmap_size = root_cell.config->pio_bitmap_size;

        for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
             b++, pio_bitmap++, root_pio_bitmap++, pio_bitmap_size--)
                *b &= *pio_bitmap | *root_pio_bitmap;

        page_free(&mem_pool, cell->vmx.ept_structs.root_table, 1);
}

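/*
 * Flush EPT-derived TLB entries after modifying a cell's EPT tables.
 * Single-context invalidation only flushes mappings tagged with the
 * current EPT pointer and is preferred when available; otherwise all
 * EPT contexts are flushed globally.
 */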
void vmx_invept(void)
{
        unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
        struct {
                u64 eptp;
                u64 reserved;
        } descriptor;
        u64 type;
        u8 ok;

        descriptor.reserved = 0;
        if (ept_cap & EPT_INVEPT_SINGLE) {
                type = VMX_INVEPT_SINGLE;
                descriptor.eptp = vmcs_read64(EPT_POINTER);
        } else {
                type = VMX_INVEPT_GLOBAL;
                descriptor.eptp = 0;
        }

        asm volatile(
                "invept (%1),%2\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (&descriptor), "r" (type)
                : "memory", "cc");
        if (!ok) {
                panic_printk("FATAL: invept failed, error %d\n",
                             vmcs_read32(VM_INSTRUCTION_ERROR));
                panic_stop(NULL);
        }
}

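/*
 * CR0/CR4 values under VMX must respect the FIXED0/FIXED1 MSRs: bits
 * set in FIXED0 must be 1, bits clear in FIXED1 must be 0. The helper
 * below computes the resulting mandatory-one set, relaxes the bits
 * that unrestricted guest mode allows to differ (PE/PG, NW/CD), and
 * uses the read shadow so the guest still observes the value it wrote.
 */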
static bool vmx_set_guest_cr(int cr, unsigned long val)
{
        unsigned long fixed0, fixed1, required1;
        bool ok = true;

        fixed0 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED0
                             : MSR_IA32_VMX_CR0_FIXED0);
        fixed1 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED1
                             : MSR_IA32_VMX_CR0_FIXED1);
        required1 = fixed0 & fixed1;
        if (cr == 0) {
                fixed1 &= ~(X86_CR0_NW | X86_CR0_CD);
                required1 &= ~(X86_CR0_PE | X86_CR0_PG);
                required1 |= X86_CR0_ET;
        } else {
                /* keeps the hypervisor visible: CR4.VMXE remains set,
                 * also in the read shadow */
                val |= X86_CR4_VMXE;
        }
        ok &= vmcs_write64(cr ? GUEST_CR4 : GUEST_CR0,
                           (val & fixed1) | required1);
        ok &= vmcs_write64(cr ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
        ok &= vmcs_write64(cr ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
                           required1 | ~fixed1);

        return ok;
}

static bool vmx_set_cell_config(struct cell *cell)
{
        u8 *io_bitmap;
        bool ok = true;

        io_bitmap = cell->vmx.io_bitmap;
        ok &= vmcs_write64(IO_BITMAP_A, page_map_hvirt2phys(io_bitmap));
        ok &= vmcs_write64(IO_BITMAP_B,
                           page_map_hvirt2phys(io_bitmap + PAGE_SIZE));

        ok &= vmcs_write64(EPT_POINTER,
                page_map_hvirt2phys(cell->vmx.ept_structs.root_table) |
                EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);

        return ok;
}

static bool vmx_set_guest_segment(const struct segment *seg,
                                  unsigned long selector_field)
{
        bool ok = true;

        ok &= vmcs_write16(selector_field, seg->selector);
        ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
        ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
        ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
                           seg->access_rights);

        return ok;
}

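/*
 * vmcs_setup programs all three VMCS areas: host state (the hypervisor
 * context to restore on every VM exit), guest state (Linux exactly as
 * it was when the hypervisor took over this CPU), and the execution
 * control fields that define which guest operations trap.
 */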
static bool vmcs_setup(struct per_cpu *cpu_data)
{
        struct desc_table_reg dtr;
        unsigned long val;
        bool ok = true;

        ok &= vmcs_write64(HOST_CR0, read_cr0());
        ok &= vmcs_write64(HOST_CR3, read_cr3());
        ok &= vmcs_write64(HOST_CR4, read_cr4());

        ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
        ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
        ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);

        ok &= vmcs_write64(HOST_FS_BASE, 0);
        ok &= vmcs_write64(HOST_GS_BASE, 0);
        ok &= vmcs_write64(HOST_TR_BASE, 0);

        read_gdtr(&dtr);
        ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
        read_idtr(&dtr);
        ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);

        ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);

        ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
        ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
        ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);

        ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
                           sizeof(cpu_data->stack));
        ok &= vmcs_write64(HOST_RIP, (unsigned long)vm_exit);

        ok &= vmx_set_guest_cr(0, read_cr0());
        ok &= vmx_set_guest_cr(4, read_cr4());

        ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);

        ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
        ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
        ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);

        ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
        ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
        ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
        ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);

        ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
        ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
                           (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
        ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);

        ok &= vmcs_write32(GUEST_SYSENTER_CS,
                           read_msr(MSR_IA32_SYSENTER_CS));
        ok &= vmcs_write64(GUEST_SYSENTER_EIP,
                           read_msr(MSR_IA32_SYSENTER_EIP));
        ok &= vmcs_write64(GUEST_SYSENTER_ESP,
                           read_msr(MSR_IA32_SYSENTER_ESP));

        ok &= vmcs_write64(GUEST_DR7, 0x00000400);
        ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

        ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);

        ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);

        /* TODO: switch PAT, PERF */

        ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
        ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);

        val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
        val |= PIN_BASED_NMI_EXITING;
        ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);

        ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);

        val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
        val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
        ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);

        ok &= vmcs_write64(MSR_BITMAP, page_map_hvirt2phys(msr_bitmap));

        val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
        val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST |
                enable_rdtscp;
        ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);

        ok &= vmcs_write64(APIC_ACCESS_ADDR,
                           page_map_hvirt2phys(apic_access_page));

        ok &= vmx_set_cell_config(cpu_data->cell);

        ok &= vmcs_write32(EXCEPTION_BITMAP, 0);

        val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
        val |= VM_EXIT_HOST_ADDR_SPACE_SIZE | VM_EXIT_SAVE_IA32_EFER |
                VM_EXIT_LOAD_IA32_EFER;
        ok &= vmcs_write32(VM_EXIT_CONTROLS, val);

        ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

        val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
        val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER;
        ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

        ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);

        ok &= vmcs_write32(CR3_TARGET_COUNT, 0);

        return ok;
}

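/*
 * Per-CPU VMX activation: the VMXON region and the VMCS must both
 * carry the revision ID reported by IA32_VMX_BASIC, and VMXON is only
 * permitted once IA32_FEATURE_CONTROL is locked with VMX enabled
 * outside SMX. If the BIOS locked that MSR without enabling VMX, the
 * CPU cannot be used.
 */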
int vmx_cpu_init(struct per_cpu *cpu_data)
{
        unsigned long cr4, feature_ctrl, mask;
        u32 revision_id;
        int err;

        cr4 = read_cr4();
        if (cr4 & X86_CR4_VMXE)
                return -EBUSY;

        err = vmx_check_features();
        if (err)
                return err;

        revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
        cpu_data->vmxon_region.revision_id = revision_id;
        cpu_data->vmxon_region.shadow_indicator = 0;
        cpu_data->vmcs.revision_id = revision_id;
        cpu_data->vmcs.shadow_indicator = 0;

        /* TODO: validate CR0 */

        /* Note: We assume that TXT is off */
        feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
        mask = FEATURE_CONTROL_LOCKED |
                FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

        if ((feature_ctrl & mask) != mask) {
                if (feature_ctrl & FEATURE_CONTROL_LOCKED)
                        return -ENODEV;

                feature_ctrl |= mask;
                write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
        }

        write_cr4(cr4 | X86_CR4_VMXE);
        /* TODO: validate CR4 */

        if (!vmxon(cpu_data)) {
                write_cr4(cr4);
                return -EIO;
        }

        cpu_data->vmx_state = VMXON;

        if (!vmcs_clear(cpu_data) ||
            !vmcs_load(cpu_data) ||
            !vmcs_setup(cpu_data))
                return -EIO;

        cpu_data->vmx_state = VMCS_READY;

        return 0;
}

void vmx_cpu_exit(struct per_cpu *cpu_data)
{
        if (cpu_data->vmx_state == VMXOFF)
                return;

        cpu_data->vmx_state = VMXOFF;
        /* Write vmx_state to ensure that vmx_schedule_vmexit stops accessing
         * the VMCS (a compiler barrier would be sufficient, in fact). */
        memory_barrier();

        vmcs_clear(cpu_data);
        asm volatile("vmxoff" : : : "cc");
        write_cr4(read_cr4() & ~X86_CR4_VMXE);
}

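/*
 * Entering the guest for the first time: HOST_RSP/HOST_RIP are already
 * programmed, so only the callee-saved registers that arch_entry
 * preserved in cpu_data->linux_reg need to be reloaded before
 * vmlaunch. On success the CPU continues inside Linux and never
 * returns here; falling through means vmlaunch failed.
 */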
void vmx_cpu_activate_vmm(struct per_cpu *cpu_data)
{
        /* We enter Linux at the point arch_entry would return to as well.
         * rax is cleared to signal success to the caller. */
        asm volatile(
                "mov (%%rdi),%%r15\n\t"
                "mov 0x8(%%rdi),%%r14\n\t"
                "mov 0x10(%%rdi),%%r13\n\t"
                "mov 0x18(%%rdi),%%r12\n\t"
                "mov 0x20(%%rdi),%%rbx\n\t"
                "mov 0x28(%%rdi),%%rbp\n\t"
                "vmlaunch\n\t"
                "pop %%rbp"
                : /* no output */
                : "a" (0), "D" (cpu_data->linux_reg)
                : "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");

        panic_printk("FATAL: vmlaunch failed, error %d\n",
                     vmcs_read32(VM_INSTRUCTION_ERROR));
        panic_stop(cpu_data);
}

static void __attribute__((noreturn))
vmx_cpu_deactivate_vmm(struct registers *guest_regs, struct per_cpu *cpu_data)
{
        unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
        unsigned long linux_ip = vmcs_read64(GUEST_RIP);

        cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);

        cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
        cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
        cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
        cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);

        cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);

        cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);

        cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
        cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
        cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);

        cpu_data->linux_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
        cpu_data->linux_sysenter_eip = vmcs_read64(GUEST_SYSENTER_EIP);
        cpu_data->linux_sysenter_esp = vmcs_read64(GUEST_SYSENTER_ESP);

        cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
        cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
        cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
        cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);

        arch_cpu_restore(cpu_data);

        /* push the return address for the final "ret" onto Linux' stack */
        stack--;
        *stack = linux_ip;

        asm volatile (
                /* restore the guest's GP registers from the register frame
                 * (lowest field: r15), skipping the unused rsp slot, then
                 * switch to Linux' stack and signal success via rax */
                "mov %%rbx,%%rsp\n\t"
                "pop %%r15\n\t"
                "pop %%r14\n\t"
                "pop %%r13\n\t"
                "pop %%r12\n\t"
                "pop %%r11\n\t"
                "pop %%r10\n\t"
                "pop %%r9\n\t"
                "pop %%r8\n\t"
                "pop %%rdi\n\t"
                "pop %%rsi\n\t"
                "pop %%rbp\n\t"
                "add $8,%%rsp\n\t"
                "pop %%rbx\n\t"
                "pop %%rdx\n\t"
                "pop %%rcx\n\t"
                "mov %%rax,%%rsp\n\t"
                "xor %%rax,%%rax\n\t"
                "ret"
                : : "a" (stack), "b" (guest_regs));
        __builtin_unreachable();
}

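/*
 * Bring the vCPU into the architectural state after INIT/SIPI: 16-bit
 * real mode with all segment limits at 0xffff, paging off, and CS:IP
 * derived from the SIPI vector (CS base = vector << 12). The BSP's
 * pseudo-SIPI instead points CS:IP at the legacy reset entry
 * 0xf000:0xfff0.
 */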
static void vmx_cpu_reset(struct per_cpu *cpu_data, unsigned int sipi_vector)
{
        unsigned long val;
        bool ok = true;

        ok &= vmx_set_guest_cr(0, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
        ok &= vmx_set_guest_cr(4, 0);

        ok &= vmcs_write64(GUEST_CR3, 0);

        ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
        ok &= vmcs_write64(GUEST_RSP, 0);

        val = 0;
        if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
                val = 0xfff0;
                sipi_vector = 0xf0;
        }
        ok &= vmcs_write64(GUEST_RIP, val);

        ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
        ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
        ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);

        ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_DS_BASE, 0);
        ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);

        ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_ES_BASE, 0);
        ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);

        ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_FS_BASE, 0);
        ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);

        ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_GS_BASE, 0);
        ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);

        ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_SS_BASE, 0);
        ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);

        ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_TR_BASE, 0);
        ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);

        ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
        ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
        ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
        ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

        ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
        ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
        ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
        ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

        ok &= vmcs_write64(GUEST_IA32_EFER, 0);

        ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
        ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
        ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);

        ok &= vmcs_write64(GUEST_DR7, 0x00000400);
        ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

        ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);

        val = vmcs_read32(VM_ENTRY_CONTROLS);
        val &= ~VM_ENTRY_IA32E_MODE;
        ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

        ok &= vmx_set_cell_config(cpu_data->cell);

        if (!ok) {
                panic_printk("FATAL: CPU reset failed\n");
                panic_stop(cpu_data);
        }
}

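/*
 * Kick a CPU out of guest mode: with VMX_PREEMPTION_TIMER_VALUE preset
 * to 0 in vmcs_setup, arming the preemption timer forces a VM exit
 * immediately after the next VM entry (or right away if the CPU is
 * already in guest mode), where pending events get processed.
 */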
void vmx_schedule_vmexit(struct per_cpu *cpu_data)
{
        u32 pin_based_ctrl;

        if (cpu_data->vmx_state != VMCS_READY)
                return;

        pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
        pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}

void vmx_cpu_park(struct per_cpu *cpu_data)
{
        vmx_cpu_reset(cpu_data, 0);
        vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
}

static void vmx_disable_preemption_timer(void)
{
        u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);

        pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
        vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}

static void vmx_skip_emulated_instruction(unsigned int inst_len)
{
        vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
}

static void update_efer(void)
{
        unsigned long efer = vmcs_read64(GUEST_IA32_EFER);

        if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
                return;

        /* paging is being enabled while EFER.LME is set: long mode becomes
         * active, which has to be mirrored in EFER.LMA and the VM-entry
         * controls */
        efer |= EFER_LMA;
        vmcs_write64(GUEST_IA32_EFER, efer);
        vmcs_write32(VM_ENTRY_CONTROLS,
                     vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
}

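/*
 * Hypercall convention: code in rax, up to two arguments in rdi and
 * rsi, return value back in rax. Calls are only accepted from ring 0
 * outside virtual-8086 mode; everything else gets -EPERM.
 */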
static void vmx_handle_hypercall(struct registers *guest_regs,
                                 struct per_cpu *cpu_data)
{
        unsigned long code = guest_regs->rax;

        vmx_skip_emulated_instruction(X86_INST_LEN_VMCALL);

        if ((!(vmcs_read64(GUEST_IA32_EFER) & EFER_LMA) &&
             vmcs_read64(GUEST_RFLAGS) & X86_RFLAGS_VM) ||
            (vmcs_read16(GUEST_CS_SELECTOR) & 3) != 0) {
                guest_regs->rax = -EPERM;
                return;
        }

        guest_regs->rax = hypercall(cpu_data, code, guest_regs->rdi,
                                    guest_regs->rsi);
        if (guest_regs->rax == -ENOSYS)
                printk("CPU %d: Unknown vmcall %d, RIP: %p\n",
                       cpu_data->cpu_id, code,
                       vmcs_read64(GUEST_RIP) - X86_INST_LEN_VMCALL);

        if (code == JAILHOUSE_HC_DISABLE && guest_regs->rax == 0)
                vmx_cpu_deactivate_vmm(guest_regs, cpu_data);
}

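/*
 * CR access exits encode the details in the exit qualification: bits
 * 3:0 hold the CR number, bits 5:4 the access type (0 = MOV to CR),
 * and bits 11:8 the GP register involved. Register 4 (rsp) needs
 * special treatment because rsp is not part of the saved register
 * frame but lives in the VMCS.
 */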
static bool vmx_handle_cr(struct registers *guest_regs,
                          struct per_cpu *cpu_data)
{
        u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
        unsigned long cr, reg, val;

        cr = exit_qualification & 0xf;
        reg = (exit_qualification >> 8) & 0xf;

        switch ((exit_qualification >> 4) & 3) {
        case 0: /* move to cr */
                if (reg == 4)
                        val = vmcs_read64(GUEST_RSP);
                else
                        val = ((unsigned long *)guest_regs)[15 - reg];

                if (cr == 0 || cr == 4) {
                        vmx_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
                        /* TODO: check for #GP reasons */
                        vmx_set_guest_cr(cr, val);
                        if (cr == 0 && val & X86_CR0_PG)
                                update_efer();
                        return true;
                }
                break;
        default:
                break;
        }
        panic_printk("FATAL: Unhandled CR access, qualification %x\n",
                     exit_qualification);
        return false;
}

static bool
vmx_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
{
        if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
                pg_structs->root_paging = x86_64_paging;
                pg_structs->root_table_gphys =
                        vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
        } else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
                   !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
                pg_structs->root_paging = i386_paging;
                pg_structs->root_table_gphys =
                        vmcs_read64(GUEST_CR3) & 0xfffff000UL;
        } else {
                printk("FATAL: Unsupported paging mode\n");
                return false;
        }
        return true;
}

static bool vmx_handle_apic_access(struct registers *guest_regs,
                                   struct per_cpu *cpu_data)
{
        struct guest_paging_structures pg_structs;
        unsigned int inst_len, offset;
        u64 qualification;
        bool is_write;

        qualification = vmcs_read64(EXIT_QUALIFICATION);

        switch (qualification & APIC_ACCESS_TYPE_MASK) {
        case APIC_ACCESS_TYPE_LINEAR_READ:
        case APIC_ACCESS_TYPE_LINEAR_WRITE:
                is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
                offset = qualification & APIC_ACCESS_OFFSET_MASK;
                /* APIC registers are 16-byte aligned; unaligned accesses
                 * are not emulated */
                if (offset & 0x00f)
                        break;

                if (!vmx_get_guest_paging_structs(&pg_structs))
                        break;

                inst_len = apic_mmio_access(guest_regs, cpu_data,
                                            vmcs_read64(GUEST_RIP),
                                            &pg_structs, offset >> 4,
                                            is_write);
                if (!inst_len)
                        break;

                vmx_skip_emulated_instruction(inst_len);
                return true;
        }
        panic_printk("FATAL: Unhandled APIC access, "
                     "qualification %x\n", qualification);
        return false;
}

static void dump_vm_exit_details(u32 reason)
{
        panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
        panic_printk("vectoring info: %x interrupt info: %x\n",
                     vmcs_read32(IDT_VECTORING_INFO_FIELD),
                     vmcs_read32(VM_EXIT_INTR_INFO));
        if (reason == EXIT_REASON_EPT_VIOLATION ||
            reason == EXIT_REASON_EPT_MISCONFIG)
                panic_printk("guest phys addr %p guest linear addr: %p\n",
                             vmcs_read64(GUEST_PHYSICAL_ADDRESS),
                             vmcs_read64(GUEST_LINEAR_ADDRESS));
}

static void dump_guest_regs(struct registers *guest_regs)
{
        panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
                     vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
        panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
                     guest_regs->rbx, guest_regs->rcx);
        panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
                     guest_regs->rsi, guest_regs->rdi);
        panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
                     vmcs_read64(GUEST_CS_SELECTOR),
                     vmcs_read64(GUEST_CS_BASE),
                     vmcs_read32(GUEST_CS_AR_BYTES),
                     !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
        panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
                     vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
        panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
}

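/*
 * Exit qualification layout for I/O instructions (SDM 27.2.1):
 * bits 2:0 = access size - 1, bit 3 = direction (1 = IN), bit 4 =
 * string instruction, bit 5 = REP prefix, bits 31:16 = port number.
 */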
static bool vmx_handle_io_access(struct registers *guest_regs,
                                 struct per_cpu *cpu_data)
{
        /* parse exit qualification for I/O instructions (see SDM, 27.2.1) */
        u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
        u16 port = (exitq >> 16) & 0xFFFF;
        bool dir_in = (exitq & 0x8) >> 3;
        unsigned int size = (exitq & 0x3) + 1;

        /* string and REP-prefixed instructions are not supported */
        if (exitq & 0x30)
                goto invalid_access;

        if (x86_pci_config_handler(guest_regs, cpu_data->cell, port, dir_in,
                                   size)) {
                vmx_skip_emulated_instruction(
                                vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
                return true;
        }

invalid_access:
        panic_printk("FATAL: Invalid PIO %s, port: %x size: %d\n",
                     dir_in ? "read" : "write", port, size);
        panic_printk("PCI address port: %x\n",
                     cpu_data->cell->pci_addr_port_val);
        return false;
}

static bool vmx_handle_ept_violation(struct registers *guest_regs,
                                     struct per_cpu *cpu_data)
{
        u64 phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
        struct guest_paging_structures pg_structs;
        struct mmio_access access;
        int result = 0;
        bool is_write;
        u32 val;

        /* We don't enable dirty/accessed bit updates in EPTP, so only the
         * read or the write flag can be set, not both. */
        is_write = !!(exitq & 0x2);

        if (!vmx_get_guest_paging_structs(&pg_structs))
                goto invalid_access;

        access = mmio_parse(cpu_data, vmcs_read64(GUEST_RIP),
                            &pg_structs, is_write);
        if (!access.inst_len || access.size != 4)
                goto invalid_access;

        if (is_write)
                val = ((unsigned long *)guest_regs)[access.reg];

        result = ioapic_access_handler(cpu_data->cell, is_write, phys_addr,
                                       &val);
        if (result == 0)
                result = pci_mmio_access_handler(cpu_data->cell, is_write,
                                                 phys_addr, &val);

        if (result == 1) {
                if (!is_write)
                        ((unsigned long *)guest_regs)[access.reg] = val;
                vmx_skip_emulated_instruction(
                                vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
                return true;
        }

invalid_access:
        /* report only unhandled access failures */
        if (result == 0)
                panic_printk("FATAL: Invalid MMIO/RAM %s, addr: %p\n",
                             is_write ? "write" : "read", phys_addr);
        return false;
}

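/*
 * Central VM-exit dispatcher. Note the NMI case: the NMI that caused
 * the exit was consumed by the CPU, so it is reinjected into the
 * hypervisor via a software "int" and then handled like a preemption
 * timer exit (hence the fall-through), which processes pending events.
 */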
void vmx_handle_exit(struct registers *guest_regs, struct per_cpu *cpu_data)
{
        u32 reason = vmcs_read32(VM_EXIT_REASON);
        int sipi_vector;

        cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;

        switch (reason) {
        case EXIT_REASON_EXCEPTION_NMI:
                asm volatile("int %0" : : "i" (NMI_VECTOR));
                /* fall through */
        case EXIT_REASON_PREEMPTION_TIMER:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
                vmx_disable_preemption_timer();
                sipi_vector = x86_handle_events(cpu_data);
                if (sipi_vector >= 0) {
                        printk("CPU %d received SIPI, vector %x\n",
                               cpu_data->cpu_id, sipi_vector);
                        vmx_cpu_reset(cpu_data, sipi_vector);
                        memset(guest_regs, 0, sizeof(*guest_regs));
                }
                vtd_check_pending_faults(cpu_data);
                return;
        case EXIT_REASON_CPUID:
                vmx_skip_emulated_instruction(X86_INST_LEN_CPUID);
                guest_regs->rax &= 0xffffffff;
                guest_regs->rbx &= 0xffffffff;
                guest_regs->rcx &= 0xffffffff;
                guest_regs->rdx &= 0xffffffff;
                __cpuid((u32 *)&guest_regs->rax, (u32 *)&guest_regs->rbx,
                        (u32 *)&guest_regs->rcx, (u32 *)&guest_regs->rdx);
                return;
        case EXIT_REASON_VMCALL:
                vmx_handle_hypercall(guest_regs, cpu_data);
                return;
        case EXIT_REASON_CR_ACCESS:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
                if (vmx_handle_cr(guest_regs, cpu_data))
                        return;
                break;
        case EXIT_REASON_MSR_READ:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
                if (guest_regs->rcx >= MSR_X2APIC_BASE &&
                    guest_regs->rcx <= MSR_X2APIC_END) {
                        vmx_skip_emulated_instruction(X86_INST_LEN_RDMSR);
                        x2apic_handle_read(guest_regs);
                        return;
                }
                panic_printk("FATAL: Unhandled MSR read: %08x\n",
                             guest_regs->rcx);
                break;
        case EXIT_REASON_MSR_WRITE:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
                if (guest_regs->rcx == MSR_X2APIC_ICR) {
                        if (!apic_handle_icr_write(cpu_data, guest_regs->rax,
                                                   guest_regs->rdx))
                                break;
                        vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
                        return;
                }
                if (guest_regs->rcx >= MSR_X2APIC_BASE &&
                    guest_regs->rcx <= MSR_X2APIC_END) {
                        x2apic_handle_write(guest_regs);
                        vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
                        return;
                }
                panic_printk("FATAL: Unhandled MSR write: %08x\n",
                             guest_regs->rcx);
                break;
        case EXIT_REASON_APIC_ACCESS:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
                if (vmx_handle_apic_access(guest_regs, cpu_data))
                        return;
                break;
        case EXIT_REASON_XSETBV:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XSETBV]++;
                if (guest_regs->rax & X86_XCR0_FP &&
                    (guest_regs->rax & ~cpuid_eax(0x0d)) == 0 &&
                    guest_regs->rcx == 0 && guest_regs->rdx == 0) {
                        vmx_skip_emulated_instruction(X86_INST_LEN_XSETBV);
                        asm volatile(
                                "xsetbv"
                                : /* no output */
                                : "a" (guest_regs->rax), "c" (0), "d" (0));
                        return;
                }
                panic_printk("FATAL: Invalid xsetbv parameters: "
                             "xcr[%d] = %08x:%08x\n", guest_regs->rcx,
                             guest_regs->rdx, guest_regs->rax);
                break;
        case EXIT_REASON_IO_INSTRUCTION:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
                if (vmx_handle_io_access(guest_regs, cpu_data))
                        return;
                break;
        case EXIT_REASON_EPT_VIOLATION:
                cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
                if (vmx_handle_ept_violation(guest_regs, cpu_data))
                        return;
                break;
        default:
                panic_printk("FATAL: %s, reason %d\n",
                             (reason & EXIT_REASONS_FAILED_VMENTRY) ?
                             "VM-Entry failure" : "Unhandled VM-Exit",
                             (u16)reason);
                dump_vm_exit_details(reason);
                break;
        }
        dump_guest_regs(guest_regs);
        panic_halt(cpu_data);
}

void vmx_entry_failure(struct per_cpu *cpu_data)
{
        panic_printk("FATAL: vmresume failed, error %d\n",
                     vmcs_read32(VM_INSTRUCTION_ERROR));
        panic_stop(cpu_data);
}