/*
 * Jailhouse, a Linux-based partitioning hypervisor
 *
 * Copyright (c) Siemens AG, 2013
 *
 * Authors:
 *  Jan Kiszka <jan.kiszka@siemens.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */

#include <jailhouse/entry.h>
#include <jailhouse/paging.h>
#include <jailhouse/processor.h>
#include <jailhouse/printk.h>
#include <jailhouse/string.h>
#include <jailhouse/control.h>
#include <jailhouse/hypercall.h>
#include <jailhouse/mmio.h>
#include <jailhouse/pci.h>
#include <asm/apic.h>
#include <asm/control.h>
#include <asm/ioapic.h>
#include <asm/vmx.h>
#include <asm/vtd.h>
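
/*
 * Note: in the VMX segment access-rights format, bit 16 is the "unusable"
 * flag, so access_rights = 0x10000 below marks a present-but-unusable
 * segment; this is what gets loaded for SS and LDTR in vmcs_setup().
 */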
static const struct segment invalid_seg = {
	.access_rights = 0x10000
};

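/*
 * MSR bitmap layout (per the VMX spec): four 1 KB regions in one 4 KB page,
 * selecting read/write interception for MSRs 0x00000000-0x00001fff and
 * 0xc0000000-0xc0001fff. One bit per MSR, set = intercept. Worked example:
 * the read-intercept byte 0x0c at index 0x800/8 has bits 2 and 3 set, i.e.
 * it traps reads of MSRs 0x802 and 0x803.
 */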
static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
	[ VMX_MSR_BMP_0000_READ ] = {
		[      0/8 ...  0x7ff/8 ] = 0,
		[  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
		[  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
		[  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
		[  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
		[  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
		[  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
		[  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
		[  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
		[  0x840/8 ... 0x1fff/8 ] = 0,
	},
	[ VMX_MSR_BMP_C000_READ ] = {
		[      0/8 ... 0x1fff/8 ] = 0,
	},
	[ VMX_MSR_BMP_0000_WRITE ] = {
		[      0/8 ...  0x807/8 ] = 0,
		[  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
		[  0x810/8 ...  0x827/8 ] = 0,
		[  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
		[  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
		[  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
		[  0x840/8 ... 0x1fff/8 ] = 0,
	},
	[ VMX_MSR_BMP_C000_WRITE ] = {
		[      0/8 ... 0x1fff/8 ] = 0,
	},
};

static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];

static bool vmxon(struct per_cpu *cpu_data)
{
	unsigned long vmxon_addr;
	u8 ok;

	vmxon_addr = page_map_hvirt2phys(&cpu_data->vmxon_region);
	asm volatile(
		"vmxon (%1)\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&vmxon_addr), "m" (vmxon_addr)
		: "memory", "cc");
	return ok;
}

static bool vmcs_clear(struct per_cpu *cpu_data)
{
	unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
	u8 ok;

	asm volatile(
		"vmclear (%1)\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&vmcs_addr), "m" (vmcs_addr)
		: "memory", "cc");
	return ok;
}

static bool vmcs_load(struct per_cpu *cpu_data)
{
	unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
	u8 ok;

	asm volatile(
		"vmptrld (%1)\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&vmcs_addr), "m" (vmcs_addr)
		: "memory", "cc");
	return ok;
}

static inline unsigned long vmcs_read64(unsigned long field)
{
	unsigned long value;

	asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
	return value;
}

static inline u16 vmcs_read16(unsigned long field)
{
	return vmcs_read64(field);
}

static inline u32 vmcs_read32(unsigned long field)
{
	return vmcs_read64(field);
}

static bool vmcs_write64(unsigned long field, unsigned long val)
{
	u8 ok;

	asm volatile(
		"vmwrite %1,%2\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (val), "r" (field)
		: "cc");
	if (!ok)
		printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
		       field, vmcs_read32(VM_INSTRUCTION_ERROR),
		       __builtin_return_address(0));
	return ok;
}

static bool vmcs_write16(unsigned long field, u16 value)
{
	return vmcs_write64(field, value);
}

static bool vmcs_write32(unsigned long field, u32 value)
{
	return vmcs_write64(field, value);
}

static int vmx_check_features(void)
{
	unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
	unsigned long vmx_pin_ctrl, vmx_basic;

	if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
		return -ENODEV;

	vmx_basic = read_msr(MSR_IA32_VMX_BASIC);

	/* require VMCS size <= PAGE_SIZE,
	 * VMCS memory access type == write back and
	 * availability of TRUE_*_CTLS */
	if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
	    ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
	    !(vmx_basic & (1UL << 55)))
		return -EIO;

	/* require NMI exiting and preemption timer support */
	vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
	if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
	    !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
		return -EIO;

	/* require I/O and MSR bitmap as well as secondary controls support */
	vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
	if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
	    !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
	    !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
		return -EIO;

	/* require disabling of CR3 access interception */
	vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
	if (vmx_proc_ctrl &
	    (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
		return -EIO;

	/* require APIC access, EPT and unrestricted guest mode support */
	vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
	ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
	if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
	    !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
	    (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
	    !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
	    !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
		return -EIO;

	/* require activity state HLT */
	if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
		return -EIO;

	return 0;
}

static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
	*pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
		EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
}

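/*
 * Non-leaf EPT entries only carry the read/write/execute permission bits
 * (bits 0-2); memory-type bits exist only in leaf entries, which is why no
 * EPT_FLAG_WB_TYPE shows up here.
 */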
int vmx_init(void)
{
	unsigned int n;
	int err;

	err = vmx_check_features();
	if (err)
		return err;

	/* derive ept_paging from very similar x86_64_paging */
	memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
	for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
		ept_paging[n].set_next_pt = ept_set_next_pt;
	if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
		ept_paging[1].page_size = 0;
	if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
		ept_paging[2].page_size = 0;

	if (using_x2apic) {
		/* allow direct x2APIC access except for ICR writes */
		memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
		       0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
		memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
		       0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
		msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;
	}

	return vmx_cell_init(&root_cell);
}

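/*
 * Worked example for the ICR interception set up above: MSR_X2APIC_ICR is
 * 0x830, so it lives in bitmap byte 0x830/8 = 0x106, bit 0x830 % 8 = 0;
 * storing 0x01 re-arms exactly the write intercept for the ICR, keeping
 * IPI destinations under hypervisor control while all other x2APIC MSRs
 * stay direct. The using_x2apic guard is a reconstruction - opening up the
 * bitmap only makes sense when the APIC actually runs in x2APIC mode.
 */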
unsigned long arch_page_map_gphys2phys(struct per_cpu *cpu_data,
				       unsigned long gphys)
{
	return page_map_virt2phys(&cpu_data->cell->vmx.ept_structs, gphys);
}

int vmx_cell_init(struct cell *cell)
{
	const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
	u32 pio_bitmap_size = cell->config->pio_bitmap_size;
	unsigned int n, size;
	int err;
	u8 *b;

	/* build root EPT of cell */
	cell->vmx.ept_structs.root_paging = ept_paging;
	cell->vmx.ept_structs.root_table = page_alloc(&mem_pool, 1);
	if (!cell->vmx.ept_structs.root_table)
		return -ENOMEM;

	err = page_map_create(&cell->vmx.ept_structs,
			      page_map_hvirt2phys(apic_access_page),
			      PAGE_SIZE, XAPIC_BASE,
			      EPT_FLAG_READ|EPT_FLAG_WRITE|EPT_FLAG_WB_TYPE,
			      PAGE_MAP_NON_COHERENT);
	if (err)
		return err;

	memset(cell->vmx.io_bitmap, -1, sizeof(cell->vmx.io_bitmap));

	for (n = 0; n < 2; n++) {
		size = pio_bitmap_size <= PAGE_SIZE ?
			pio_bitmap_size : PAGE_SIZE;
		memcpy(cell->vmx.io_bitmap + n * PAGE_SIZE, pio_bitmap, size);
		pio_bitmap += size;
		pio_bitmap_size -= size;
	}

	if (cell != &root_cell) {
		/*
		 * Shrink PIO access of root cell corresponding to new cell's
		 * access rights.
		 */
		pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
		pio_bitmap_size = cell->config->pio_bitmap_size;
		for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
		     b++, pio_bitmap++, pio_bitmap_size--)
			*b |= ~*pio_bitmap;
	}

	return 0;
}

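/*
 * I/O bitmap convention (VMX): one bit per port, set = intercept. The
 * memset(-1) above therefore blocks all 64K ports by default, and the
 * config's pio_bitmap clears bits for ports the cell may use. The
 * "*b |= ~*pio_bitmap" in the root-shrink loop is reconstructed as the
 * inverse of the restore in vmx_cell_exit(): ports granted to the new cell
 * (0-bits) become intercepted (1-bits) for the root cell.
 */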
int vmx_map_memory_region(struct cell *cell,
			  const struct jailhouse_memory *mem)
{
	u64 phys_start = mem->phys_start;
	u32 flags = EPT_FLAG_WB_TYPE;

	if (mem->flags & JAILHOUSE_MEM_READ)
		flags |= EPT_FLAG_READ;
	if (mem->flags & JAILHOUSE_MEM_WRITE)
		flags |= EPT_FLAG_WRITE;
	if (mem->flags & JAILHOUSE_MEM_EXECUTE)
		flags |= EPT_FLAG_EXECUTE;
	if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
		phys_start = page_map_hvirt2phys(&cell->comm_page);

	return page_map_create(&cell->vmx.ept_structs, phys_start, mem->size,
			       mem->virt_start, flags, PAGE_MAP_NON_COHERENT);
}

int vmx_unmap_memory_region(struct cell *cell,
			    const struct jailhouse_memory *mem)
{
	return page_map_destroy(&cell->vmx.ept_structs, mem->virt_start,
				mem->size, PAGE_MAP_NON_COHERENT);
}

void vmx_cell_exit(struct cell *cell)
{
	const u8 *root_pio_bitmap =
		jailhouse_cell_pio_bitmap(root_cell.config);
	const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
	u32 pio_bitmap_size = cell->config->pio_bitmap_size;
	u8 *b;

	page_map_destroy(&cell->vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
			 PAGE_MAP_NON_COHERENT);

	/* restore the root cell's PIO access, limited to what it owns */
	if (root_cell.config->pio_bitmap_size < pio_bitmap_size)
		pio_bitmap_size = root_cell.config->pio_bitmap_size;

	for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
	     b++, pio_bitmap++, root_pio_bitmap++, pio_bitmap_size--)
		*b &= *pio_bitmap | *root_pio_bitmap;

	page_free(&mem_pool, cell->vmx.ept_structs.root_table, 1);
}

void vmx_invept(void)
{
	unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
	struct {
		u64 eptp;
		u64 reserved;
	} descriptor;
	u64 type;
	u8 ok;

	descriptor.reserved = 0;
	if (ept_cap & EPT_INVEPT_SINGLE) {
		type = VMX_INVEPT_SINGLE;
		descriptor.eptp = vmcs_read64(EPT_POINTER);
	} else {
		type = VMX_INVEPT_GLOBAL;
		descriptor.eptp = 0;
	}

	asm volatile(
		"invept (%1),%2\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&descriptor), "r" (type)
		: "memory", "cc");
	if (!ok)
		panic_printk("FATAL: invept failed, error %d\n",
			     vmcs_read32(VM_INSTRUCTION_ERROR));
}

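/*
 * Single-context invalidation flushes only mappings tagged with the
 * current EPT pointer; the global fallback flushes mappings for all EPTPs.
 * vmx_check_features() already guaranteed that at least one of the two
 * invalidation types is available.
 */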
static bool vmx_set_guest_cr(int cr, unsigned long val)
{
	unsigned long fixed0, fixed1, required1;
	bool ok = true;

	fixed0 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED0
			     : MSR_IA32_VMX_CR0_FIXED0);
	fixed1 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED1
			     : MSR_IA32_VMX_CR0_FIXED1);
	required1 = fixed0 & fixed1;
	if (cr == 0) {
		fixed1 &= ~(X86_CR0_NW | X86_CR0_CD);
		required1 &= ~(X86_CR0_PE | X86_CR0_PG);
		required1 |= X86_CR0_ET;
	} else {
		/* keeps the hypervisor visible */
		required1 |= X86_CR4_VMXE;
	}
	ok &= vmcs_write64(cr ? GUEST_CR4 : GUEST_CR0,
			   (val & fixed1) | required1);
	ok &= vmcs_write64(cr ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
	ok &= vmcs_write64(cr ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
			   required1 | ~fixed1);

	return ok;
}

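/*
 * FIXED-MSR semantics for reference: a bit set in FIXED0 must be 1 in the
 * control register while VMX is on, a bit clear in FIXED1 must be 0. Thus
 * required1 = fixed0 & fixed1 is the must-be-1 set, and required1 |
 * ~fixed1 covers every bit the guest may not flip, which becomes the
 * guest/host ownership mask while the guest-requested value stays visible
 * in the read shadow. PE and PG are exempted above because unrestricted
 * guest mode lifts their fixed-1 status, and NW/CD are cleared from fixed1
 * so the effective CR0 never disables caching.
 */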
static bool vmx_set_cell_config(struct cell *cell)
{
	u8 *io_bitmap;
	bool ok = true;

	io_bitmap = cell->vmx.io_bitmap;
	ok &= vmcs_write64(IO_BITMAP_A, page_map_hvirt2phys(io_bitmap));
	ok &= vmcs_write64(IO_BITMAP_B,
			   page_map_hvirt2phys(io_bitmap + PAGE_SIZE));

	ok &= vmcs_write64(EPT_POINTER,
		page_map_hvirt2phys(cell->vmx.ept_structs.root_table) |
		EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);

	return ok;
}

static bool vmx_set_guest_segment(const struct segment *seg,
				  unsigned long selector_field)
{
	bool ok = true;

	ok &= vmcs_write16(selector_field, seg->selector);
	ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
	ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
	ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
			   seg->access_rights);

	return ok;
}

static bool vmcs_setup(struct per_cpu *cpu_data)
{
	struct desc_table_reg dtr;
	unsigned long val;
	bool ok = true;

	ok &= vmcs_write64(HOST_CR0, read_cr0());
	ok &= vmcs_write64(HOST_CR3, read_cr3());
	ok &= vmcs_write64(HOST_CR4, read_cr4());

	ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
	ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
	ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);

	ok &= vmcs_write64(HOST_FS_BASE, 0);
	ok &= vmcs_write64(HOST_GS_BASE, 0);
	ok &= vmcs_write64(HOST_TR_BASE, 0);

	read_gdtr(&dtr);
	ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
	read_idtr(&dtr);
	ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);

	ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);

	ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
	ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
	ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);

	ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
			   sizeof(cpu_data->stack));
	ok &= vmcs_write64(HOST_RIP, (unsigned long)vm_exit);
474 ok &= vmx_set_guest_cr(0, read_cr0());
475 ok &= vmx_set_guest_cr(4, read_cr4());
477 ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);
479 ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
480 ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
481 ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
482 ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
483 ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
484 ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
485 ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
486 ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);
488 ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
489 ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
490 ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
491 ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);
493 ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
494 ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
495 (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
496 ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);
498 ok &= vmcs_write32(GUEST_SYSENTER_CS,
499 read_msr(MSR_IA32_SYSENTER_CS));
500 ok &= vmcs_write64(GUEST_SYSENTER_EIP,
501 read_msr(MSR_IA32_SYSENTER_EIP));
502 ok &= vmcs_write64(GUEST_SYSENTER_ESP,
503 read_msr(MSR_IA32_SYSENTER_ESP));
505 ok &= vmcs_write64(GUEST_DR7, 0x00000400);
506 ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
508 ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
509 ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
510 ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
512 ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);
514 // TODO: switch PAT, PERF */
516 ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
517 ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
519 val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
520 val |= PIN_BASED_NMI_EXITING;
521 ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
523 ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
525 val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
526 val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
527 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
528 val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
529 ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);
531 ok &= vmcs_write64(MSR_BITMAP, page_map_hvirt2phys(msr_bitmap));
533 val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
534 val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
535 SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST;
536 ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);
538 ok &= vmcs_write64(APIC_ACCESS_ADDR,
539 page_map_hvirt2phys(apic_access_page));
541 ok &= vmx_set_cell_config(cpu_data->cell);
543 ok &= vmcs_write32(EXCEPTION_BITMAP, 0);
545 val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
546 val |= VM_EXIT_HOST_ADDR_SPACE_SIZE | VM_EXIT_SAVE_IA32_EFER |
547 VM_EXIT_LOAD_IA32_EFER;
548 ok &= vmcs_write32(VM_EXIT_CONTROLS, val);
550 ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
551 ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
552 ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
554 val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
555 val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER;
556 ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
558 ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);
560 ok &= vmcs_write32(CR3_TARGET_COUNT, 0);
int vmx_cpu_init(struct per_cpu *cpu_data)
{
	unsigned long cr4, feature_ctrl, mask;
	u32 revision_id;
	int err;

	cr4 = read_cr4();
	if (cr4 & X86_CR4_VMXE)
		return -EBUSY;

	err = vmx_check_features();
	if (err)
		return err;

	revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
	cpu_data->vmxon_region.revision_id = revision_id;
	cpu_data->vmxon_region.shadow_indicator = 0;
	cpu_data->vmcs.revision_id = revision_id;
	cpu_data->vmcs.shadow_indicator = 0;

	// TODO: validate CR0

	/* Note: We assume that TXT is off */
	feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
	mask = FEATURE_CONTROL_LOCKED |
		FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

	if ((feature_ctrl & mask) != mask) {
		if (feature_ctrl & FEATURE_CONTROL_LOCKED)
			return -ENODEV;

		feature_ctrl |= mask;
		write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
	}

	write_cr4(cr4 | X86_CR4_VMXE);
	// TODO: validate CR4

	if (!vmxon(cpu_data)) {
		write_cr4(cr4);
		return -EIO;
	}

	cpu_data->vmx_state = VMXON;

	if (!vmcs_clear(cpu_data) ||
	    !vmcs_load(cpu_data) ||
	    !vmcs_setup(cpu_data))
		return -EIO;

	cpu_data->vmx_state = VMCS_READY;

	return 0;
}

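/*
 * On the IA32_FEATURE_CONTROL handling above: once the BIOS sets the lock
 * bit, the MSR is read-only until the next reset. If it is locked without
 * VMXON-outside-SMX enabled, VMX is unusable on this CPU (-ENODEV); if it
 * is still unlocked, the hypervisor may set and lock the bits itself.
 */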
void vmx_cpu_exit(struct per_cpu *cpu_data)
{
	if (cpu_data->vmx_state == VMXOFF)
		return;

	cpu_data->vmx_state = VMXOFF;
	/* Write vmx_state to ensure that vmx_schedule_vmexit stops accessing
	 * the VMCS (a compiler barrier would be sufficient, in fact). */
	memory_barrier();

	vmcs_clear(cpu_data);
	asm volatile("vmxoff" : : : "cc");
	write_cr4(read_cr4() & ~X86_CR4_VMXE);
}

void vmx_cpu_activate_vmm(struct per_cpu *cpu_data)
{
	/* We enter Linux at the point arch_entry would return to as well.
	 * rax is cleared to signal success to the caller. */
	asm volatile(
		"mov (%%rdi),%%r15\n\t"
		"mov 0x8(%%rdi),%%r14\n\t"
		"mov 0x10(%%rdi),%%r13\n\t"
		"mov 0x18(%%rdi),%%r12\n\t"
		"mov 0x20(%%rdi),%%rbx\n\t"
		"mov 0x28(%%rdi),%%rbp\n\t"
		"vmlaunch\n\t"
		"pop %%rbp"
		: /* no output */
		: "a" (0), "D" (cpu_data->linux_reg)
		: "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");

	/* only reached if vmlaunch fails */
	panic_printk("FATAL: vmlaunch failed, error %d\n",
		     vmcs_read32(VM_INSTRUCTION_ERROR));
	panic_stop(cpu_data);
}

static void __attribute__((noreturn))
vmx_cpu_deactivate_vmm(struct registers *guest_regs, struct per_cpu *cpu_data)
{
	unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
	unsigned long linux_ip = vmcs_read64(GUEST_RIP);

	cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);

	cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
	cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
	cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
	cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);

	cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);

	cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);

	cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
	cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
	cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);

	cpu_data->linux_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
	cpu_data->linux_sysenter_eip = vmcs_read64(GUEST_SYSENTER_EIP);
	cpu_data->linux_sysenter_esp = vmcs_read64(GUEST_SYSENTER_ESP);

	cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
	cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
	cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
	cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);

	arch_cpu_restore(cpu_data);

	stack--;
	*stack = linux_ip;

	asm volatile (
		"mov %%rbx,%%rsp\n\t"
		"pop %%r15\n\t"
		"pop %%r14\n\t"
		"pop %%r13\n\t"
		"pop %%r12\n\t"
		"pop %%r11\n\t"
		"pop %%r10\n\t"
		"pop %%r9\n\t"
		"pop %%r8\n\t"
		"pop %%rdi\n\t"
		"pop %%rsi\n\t"
		"pop %%rbp\n\t"
		"add $8,%%rsp\n\t"
		"pop %%rbx\n\t"
		"pop %%rdx\n\t"
		"pop %%rcx\n\t"
		"mov %%rax,%%rsp\n\t"
		"xor %%rax,%%rax\n\t"
		"ret"
		: : "a" (stack), "b" (guest_regs));
	__builtin_unreachable();
}

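/*
 * The pop sequence above is reconstructed under the assumption that struct
 * registers mirrors the vm_exit push order (r15 at the lowest address, rax
 * last, with a padding slot for rsp that the "add $8" skips); the
 * "[15 - reg]" indexing in vmx_handle_cr() relies on the same layout. The
 * final "ret" consumes the return address placed at *--stack, so Linux
 * resumes right behind its vmcall with rax = 0.
 */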
static void vmx_cpu_reset(struct per_cpu *cpu_data, unsigned int sipi_vector)
{
	unsigned long val;
	bool ok = true;

	ok &= vmx_set_guest_cr(0, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
	ok &= vmx_set_guest_cr(4, 0);

	ok &= vmcs_write64(GUEST_CR3, 0);

	ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
	ok &= vmcs_write64(GUEST_RSP, 0);

	val = 0;
	if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
		val = 0xfff0;
		sipi_vector = 0xf0;
	}
	ok &= vmcs_write64(GUEST_RIP, val);

	ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
	ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
	ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);

	ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_DS_BASE, 0);
	ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_ES_BASE, 0);
	ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_FS_BASE, 0);
	ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_GS_BASE, 0);
	ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_SS_BASE, 0);
	ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_TR_BASE, 0);
	ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);

	ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
	ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	ok &= vmcs_write64(GUEST_IA32_EFER, 0);

	ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
	ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
	ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);

	ok &= vmcs_write64(GUEST_DR7, 0x00000400);
	ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);

	val = vmcs_read32(VM_ENTRY_CONTROLS);
	val &= ~VM_ENTRY_IA32E_MODE;
	ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

	ok &= vmx_set_cell_config(cpu_data->cell);

	if (!ok) {
		panic_printk("FATAL: CPU reset failed\n");
		panic_stop(cpu_data);
	}
}

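/*
 * Access-rights decoding for the values above, per the SDM: 0x9b = present
 * accessed code segment, 0x93 = present accessed data segment, 0x8b = busy
 * 32-bit TSS, 0x82 = LDT. With CS selector = vector << 8 and CS base =
 * vector << 12, the cell CPU wakes up in real mode at the address encoded
 * in the SIPI vector, as on physical hardware.
 */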
void vmx_schedule_vmexit(struct per_cpu *cpu_data)
{
	u32 pin_based_ctrl;

	if (cpu_data->vmx_state != VMCS_READY)
		return;

	pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}

void vmx_cpu_park(struct per_cpu *cpu_data)
{
	vmx_cpu_reset(cpu_data, 0);
	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
}

static void vmx_disable_preemption_timer(void)
{
	u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);

	pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}

static void vmx_skip_emulated_instruction(unsigned int inst_len)
{
	vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
}

static void update_efer(void)
{
	unsigned long efer = vmcs_read64(GUEST_IA32_EFER);

	if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
		return;

	efer |= EFER_LMA;
	vmcs_write64(GUEST_IA32_EFER, efer);
	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
}

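/*
 * Rationale: when the guest enables paging with EFER.LME set, hardware
 * would normally set EFER.LMA itself, but with CR0 writes intercepted the
 * switch happens here. VM entry additionally checks that the "IA-32e mode
 * guest" entry control matches EFER.LMA, so both must be updated together.
 */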
static void vmx_handle_hypercall(struct registers *guest_regs,
				 struct per_cpu *cpu_data)
{
	unsigned long code = guest_regs->rax;

	vmx_skip_emulated_instruction(X86_INST_LEN_VMCALL);

	if ((!(vmcs_read64(GUEST_IA32_EFER) & EFER_LMA) &&
	     vmcs_read64(GUEST_RFLAGS) & X86_RFLAGS_VM) ||
	    (vmcs_read16(GUEST_CS_SELECTOR) & 3) != 0) {
		guest_regs->rax = -EPERM;
		return;
	}

	guest_regs->rax = hypercall(cpu_data, code, guest_regs->rdi,
				    guest_regs->rsi);
	if (guest_regs->rax == -ENOSYS)
		printk("CPU %d: Unknown vmcall %d, RIP: %p\n",
		       cpu_data->cpu_id, code,
		       vmcs_read64(GUEST_RIP) - X86_INST_LEN_VMCALL);

	if (code == JAILHOUSE_HC_DISABLE && guest_regs->rax == 0)
		vmx_cpu_deactivate_vmm(guest_regs, cpu_data);
}

static bool vmx_handle_cr(struct registers *guest_regs,
			  struct per_cpu *cpu_data)
{
	u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
	unsigned long cr, reg, val;

	cr = exit_qualification & 0xf;
	reg = (exit_qualification >> 8) & 0xf;

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* move to cr */
		if (reg == 4)
			val = vmcs_read64(GUEST_RSP);
		else
			val = ((unsigned long *)guest_regs)[15 - reg];

		if (cr == 0 || cr == 4) {
			vmx_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
			/* TODO: check for #GP reasons */
			vmx_set_guest_cr(cr, val);
			if (cr == 0 && val & X86_CR0_PG)
				update_efer();
			return true;
		}
		break;
	default:
		break;
	}
	panic_printk("FATAL: Unhandled CR access, qualification %x\n",
		     exit_qualification);
	return false;
}

static bool
vmx_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
{
	if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
		pg_structs->root_paging = x86_64_paging;
		pg_structs->root_table_gphys =
			vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
	} else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
		   !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
		pg_structs->root_paging = i386_paging;
		pg_structs->root_table_gphys =
			vmcs_read64(GUEST_CR3) & 0xfffff000UL;
	} else {
		printk("FATAL: Unsupported paging mode\n");
		return false;
	}
	return true;
}

static bool vmx_handle_apic_access(struct registers *guest_regs,
				   struct per_cpu *cpu_data)
{
	struct guest_paging_structures pg_structs;
	unsigned int inst_len, offset;
	u64 qualification;
	bool is_write;

	qualification = vmcs_read64(EXIT_QUALIFICATION);

	switch (qualification & APIC_ACCESS_TYPE_MASK) {
	case APIC_ACCESS_TYPE_LINEAR_READ:
	case APIC_ACCESS_TYPE_LINEAR_WRITE:
		is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
		offset = qualification & APIC_ACCESS_OFFSET_MASK;
		if (offset & 0x00f)
			break;

		if (!vmx_get_guest_paging_structs(&pg_structs))
			break;

		inst_len = apic_mmio_access(guest_regs, cpu_data,
					    vmcs_read64(GUEST_RIP),
					    &pg_structs, offset >> 4,
					    is_write);
		if (!inst_len)
			break;

		vmx_skip_emulated_instruction(inst_len);
		return true;
	}
	panic_printk("FATAL: Unhandled APIC access, "
		     "qualification %x\n", qualification);
	return false;
}

static void dump_vm_exit_details(u32 reason)
{
	panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
	panic_printk("vectoring info: %x interrupt info: %x\n",
		     vmcs_read32(IDT_VECTORING_INFO_FIELD),
		     vmcs_read32(VM_EXIT_INTR_INFO));
	if (reason == EXIT_REASON_EPT_VIOLATION ||
	    reason == EXIT_REASON_EPT_MISCONFIG)
		panic_printk("guest phys addr %p guest linear addr: %p\n",
			     vmcs_read64(GUEST_PHYSICAL_ADDRESS),
			     vmcs_read64(GUEST_LINEAR_ADDRESS));
}

static void dump_guest_regs(struct registers *guest_regs)
{
	panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
		     vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
	panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
		     guest_regs->rbx, guest_regs->rcx);
	panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
		     guest_regs->rsi, guest_regs->rdi);
	panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
		     vmcs_read64(GUEST_CS_SELECTOR),
		     vmcs_read64(GUEST_CS_BASE),
		     vmcs_read32(GUEST_CS_AR_BYTES),
		     !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
	panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
		     vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
	panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
}

static bool vmx_handle_io_access(struct registers *guest_regs,
				 struct per_cpu *cpu_data)
{
	/* parse exit qualification for I/O instructions (see SDM, 27.2.1) */
	u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
	u16 port = (exitq >> 16) & 0xFFFF;
	bool dir_in = (exitq & 0x8) >> 3;
	unsigned int size = (exitq & 0x3) + 1;

	/* string and REP-prefixed instructions are not supported */
	if (exitq & 0x30)
		goto invalid_access;

	if (x86_pci_config_handler(guest_regs, cpu_data->cell, port, dir_in,
				   size) == 1) {
		vmx_skip_emulated_instruction(
				vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
		return true;
	}

invalid_access:
	panic_printk("FATAL: Invalid PIO %s, port: %x size: %d\n",
		     dir_in ? "read" : "write", port, size);
	panic_printk("PCI address port: %x\n",
		     cpu_data->cell->pci_addr_port_val);
	return false;
}

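/*
 * I/O exit-qualification bits used above (SDM): 2:0 encode the access size
 * minus one (the code masks the two relevant bits), 3 = direction (1 =
 * IN), 4 = string instruction, 5 = REP prefix, 31:16 = port number; hence
 * the 0x30 test rejects INS/OUTS and REP-prefixed forms in one go.
 */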
static bool vmx_handle_ept_violation(struct registers *guest_regs,
				     struct per_cpu *cpu_data)
{
	u64 phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
	struct guest_paging_structures pg_structs;
	struct mmio_access access;
	int result = 0;
	bool is_write;
	u32 val;

	/* We don't enable dirty/accessed bit updates in EPTP, so only the
	 * read or the write flag can be set, never both. */
	is_write = !!(exitq & 0x2);

	if (!vmx_get_guest_paging_structs(&pg_structs))
		goto invalid_access;

	access = mmio_parse(cpu_data, vmcs_read64(GUEST_RIP),
			    &pg_structs, is_write);
	if (!access.inst_len || access.size != 4)
		goto invalid_access;

	if (is_write)
		val = ((unsigned long *)guest_regs)[access.reg];

	result = ioapic_access_handler(cpu_data->cell, is_write, phys_addr,
				       &val);
	if (result == 0)
		result = pci_mmio_access_handler(cpu_data->cell, is_write,
						 phys_addr, &val);

	if (result == 1) {
		if (!is_write)
			((unsigned long *)guest_regs)[access.reg] = val;
		vmx_skip_emulated_instruction(
				vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
		return true;
	}

invalid_access:
	/* report only unhandled access failures */
	if (result == 0)
		panic_printk("FATAL: Invalid MMIO/RAM %s, addr: %p\n",
			     is_write ? "write" : "read", phys_addr);
	return false;
}

void vmx_handle_exit(struct registers *guest_regs, struct per_cpu *cpu_data)
{
	u32 reason = vmcs_read32(VM_EXIT_REASON);
	int sipi_vector;

	cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;

	switch (reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		asm volatile("int %0" : : "i" (NMI_VECTOR));
		/* fall through */
	case EXIT_REASON_PREEMPTION_TIMER:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
		vmx_disable_preemption_timer();
		sipi_vector = x86_handle_events(cpu_data);
		if (sipi_vector >= 0) {
			printk("CPU %d received SIPI, vector %x\n",
			       cpu_data->cpu_id, sipi_vector);
			vmx_cpu_reset(cpu_data, sipi_vector);
			memset(guest_regs, 0, sizeof(*guest_regs));
		}
		vtd_check_pending_faults(cpu_data);
		return;
	case EXIT_REASON_CPUID:
		vmx_skip_emulated_instruction(X86_INST_LEN_CPUID);
		guest_regs->rax &= 0xffffffff;
		guest_regs->rbx &= 0xffffffff;
		guest_regs->rcx &= 0xffffffff;
		guest_regs->rdx &= 0xffffffff;
		__cpuid((u32 *)&guest_regs->rax, (u32 *)&guest_regs->rbx,
			(u32 *)&guest_regs->rcx, (u32 *)&guest_regs->rdx);
		return;
	case EXIT_REASON_VMCALL:
		vmx_handle_hypercall(guest_regs, cpu_data);
		return;
	case EXIT_REASON_CR_ACCESS:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
		if (vmx_handle_cr(guest_regs, cpu_data))
			return;
		break;
	case EXIT_REASON_MSR_READ:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
		if (guest_regs->rcx >= MSR_X2APIC_BASE &&
		    guest_regs->rcx <= MSR_X2APIC_END) {
			vmx_skip_emulated_instruction(X86_INST_LEN_RDMSR);
			x2apic_handle_read(guest_regs);
			return;
		}
		panic_printk("FATAL: Unhandled MSR read: %08x\n",
			     guest_regs->rcx);
		break;
	case EXIT_REASON_MSR_WRITE:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
		if (guest_regs->rcx == MSR_X2APIC_ICR) {
			if (!apic_handle_icr_write(cpu_data, guest_regs->rax,
						   guest_regs->rdx))
				break;
			vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
			return;
		}
		if (guest_regs->rcx >= MSR_X2APIC_BASE &&
		    guest_regs->rcx <= MSR_X2APIC_END) {
			x2apic_handle_write(guest_regs);
			vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
			return;
		}
		panic_printk("FATAL: Unhandled MSR write: %08x\n",
			     guest_regs->rcx);
		break;
	case EXIT_REASON_APIC_ACCESS:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
		if (vmx_handle_apic_access(guest_regs, cpu_data))
			return;
		break;
	case EXIT_REASON_XSETBV:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XSETBV]++;
		if (guest_regs->rax & X86_XCR0_FP &&
		    (guest_regs->rax & ~cpuid_eax(0x0d)) == 0 &&
		    guest_regs->rcx == 0 && guest_regs->rdx == 0) {
			vmx_skip_emulated_instruction(X86_INST_LEN_XSETBV);
			asm volatile(
				"xsetbv"
				: /* no output */
				: "a" (guest_regs->rax), "c" (0), "d" (0));
			return;
		}
		panic_printk("FATAL: Invalid xsetbv parameters: "
			     "xcr[%d] = %08x:%08x\n", guest_regs->rcx,
			     guest_regs->rdx, guest_regs->rax);
		break;
	case EXIT_REASON_IO_INSTRUCTION:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
		if (vmx_handle_io_access(guest_regs, cpu_data))
			return;
		break;
	case EXIT_REASON_EPT_VIOLATION:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
		if (vmx_handle_ept_violation(guest_regs, cpu_data))
			return;
		break;
	default:
		panic_printk("FATAL: %s, reason %d\n",
			     (reason & EXIT_REASONS_FAILED_VMENTRY) ?
			     "VM-Entry failure" : "Unhandled VM-Exit",
			     (u16)reason);
		dump_vm_exit_details(reason);
		break;
	}
	dump_guest_regs(guest_regs);
	panic_halt(cpu_data);
}

void vmx_entry_failure(struct per_cpu *cpu_data)
{
	panic_printk("FATAL: vmresume failed, error %d\n",
		     vmcs_read32(VM_INSTRUCTION_ERROR));
	panic_stop(cpu_data);
}