/*
 * Jailhouse, a Linux-based partitioning hypervisor
 *
 * Copyright (c) Siemens AG, 2013-2015
 * Copyright (c) Valentine Sinitsyn, 2014
 *
 * Authors:
 *  Jan Kiszka <jan.kiszka@siemens.com>
 *  Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include <jailhouse/entry.h>
#include <jailhouse/paging.h>
#include <jailhouse/processor.h>
#include <jailhouse/printk.h>
#include <jailhouse/string.h>
#include <jailhouse/control.h>
#include <jailhouse/hypercall.h>
#include <asm/apic.h>
#include <asm/control.h>
#include <asm/iommu.h>
#include <asm/vcpu.h>
#include <asm/vmx.h>

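/*
 * Segment state with the "unusable" flag (bit 16 of the VMX access rights
 * format) set; loaded into guest SS and LDTR in vmcs_setup to mark them
 * invalid.
 */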
static const struct segment invalid_seg = {
	.access_rights = 0x10000
};

/* bit cleared: direct access allowed */
// TODO: convert to whitelist
static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
	[ VMX_MSR_BMP_0000_READ ] = {
		[      0/8 ...  0x26f/8 ] = 0,
		[  0x270/8 ...  0x277/8 ] = 0x80, /* 0x277 */
		[  0x278/8 ...  0x2f7/8 ] = 0,
		[  0x2f8/8 ...  0x2ff/8 ] = 0x80, /* 0x2ff */
		[  0x300/8 ...  0x7ff/8 ] = 0,
		[  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
		[  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
		[  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
		[  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
		[  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
		[  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
		[  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
		[  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
		[  0x840/8 ... 0x1fff/8 ] = 0,
	},
	[ VMX_MSR_BMP_C000_READ ] = {
		[      0/8 ... 0x1fff/8 ] = 0,
	},
	[ VMX_MSR_BMP_0000_WRITE ] = {
		[      0/8 ...   0x17/8 ] = 0,
		[   0x18/8 ...   0x1f/8 ] = 0x08, /* 0x01b */
		[   0x20/8 ...  0x1ff/8 ] = 0,
		[  0x200/8 ...  0x277/8 ] = 0xff, /* 0x200 - 0x277 */
		[  0x278/8 ...  0x2f7/8 ] = 0,
		[  0x2f8/8 ...  0x2ff/8 ] = 0x80, /* 0x2ff */
		[  0x300/8 ...  0x387/8 ] = 0,
		[  0x388/8 ...  0x38f/8 ] = 0x80, /* 0x38f */
		[  0x390/8 ...  0x7ff/8 ] = 0,
		[  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
		[  0x810/8 ...  0x827/8 ] = 0,
		[  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
		[  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
		[  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
		[  0x840/8 ... 0x1fff/8 ] = 0,
	},
	[ VMX_MSR_BMP_C000_WRITE ] = {
		[      0/8 ... 0x1fff/8 ] = 0,
	},
};

static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];
static u32 enable_rdtscp;
static unsigned long cr_maybe1[2], cr_required1[2];

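/*
 * Thin wrappers around the raw VMX instructions. Success is signaled via
 * RFLAGS (CF and ZF both clear), which SETA converts into a boolean result.
 */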
static bool vmxon(struct per_cpu *cpu_data)
{
	unsigned long vmxon_addr;
	u8 ok;

	vmxon_addr = paging_hvirt2phys(&cpu_data->vmxon_region);
	asm volatile(
		"vmxon (%1)\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&vmxon_addr), "m" (vmxon_addr)
		: "memory", "cc");
	return ok;
}

static bool vmcs_clear(struct per_cpu *cpu_data)
{
	unsigned long vmcs_addr = paging_hvirt2phys(&cpu_data->vmcs);
	u8 ok;

	asm volatile(
		"vmclear (%1)\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&vmcs_addr), "m" (vmcs_addr)
		: "memory", "cc");
	return ok;
}

static bool vmcs_load(struct per_cpu *cpu_data)
{
	unsigned long vmcs_addr = paging_hvirt2phys(&cpu_data->vmcs);
	u8 ok;

	asm volatile(
		"vmptrld (%1)\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&vmcs_addr), "m" (vmcs_addr)
		: "memory", "cc");
	return ok;
}

static inline unsigned long vmcs_read64(unsigned long field)
{
	unsigned long value;

	asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
	return value;
}

static inline u16 vmcs_read16(unsigned long field)
{
	return vmcs_read64(field);
}

static inline u32 vmcs_read32(unsigned long field)
{
	return vmcs_read64(field);
}

static bool vmcs_write64(unsigned long field, unsigned long val)
{
	u8 ok;

	asm volatile(
		"vmwrite %1,%2\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (val), "r" (field)
		: "cc");
	if (!ok)
		printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
		       field, vmcs_read32(VM_INSTRUCTION_ERROR),
		       __builtin_return_address(0));
	return ok;
}

static bool vmcs_write16(unsigned long field, u16 value)
{
	return vmcs_write64(field, value);
}

static bool vmcs_write32(unsigned long field, u32 value)
{
	return vmcs_write64(field, value);
}

static bool vmx_define_cr_restrictions(unsigned int cr_idx,
				       unsigned long maybe1,
				       unsigned long required1)
{
	if (!cr_maybe1[cr_idx]) {
		cr_maybe1[cr_idx] = maybe1;
		cr_required1[cr_idx] = required1;
		return true;
	}

	return cr_maybe1[cr_idx] == maybe1 &&
		cr_required1[cr_idx] == required1;
}

static int vmx_check_features(void)
{
	unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
	unsigned long vmx_pin_ctrl, vmx_basic, maybe1, required1;
	unsigned long vmx_entry_ctrl, vmx_exit_ctrl;

	if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
		return trace_error(-ENODEV);

	vmx_basic = read_msr(MSR_IA32_VMX_BASIC);

	/* require VMCS size <= PAGE_SIZE,
	 * VMCS memory access type == write back and
	 * availability of TRUE_*_CTLS */
	if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
	    ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
	    !(vmx_basic & (1UL << 55)))
		return trace_error(-EIO);

	/* require NMI exiting and preemption timer support */
	vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
	if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
	    !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
		return trace_error(-EIO);

	/* require I/O and MSR bitmap as well as secondary controls support */
	vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
	if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
	    !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
	    !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
		return trace_error(-EIO);

	/* require disabling of CR3 access interception */
	vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
	if (vmx_proc_ctrl &
	    (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
		return trace_error(-EIO);

	/* require APIC access, EPT and unrestricted guest mode support */
	vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
	ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
	if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
	    !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
	    (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
	    !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
	    !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
		return trace_error(-EIO);

	/* require RDTSCP if present in CPUID */
	if (cpuid_edx(0x80000001) & X86_FEATURE_RDTSCP) {
		enable_rdtscp = SECONDARY_EXEC_RDTSCP;
		if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_RDTSCP))
			return trace_error(-EIO);
	}

	/* require PAT and EFER save/restore */
	vmx_entry_ctrl = read_msr(MSR_IA32_VMX_ENTRY_CTLS) >> 32;
	vmx_exit_ctrl = read_msr(MSR_IA32_VMX_EXIT_CTLS) >> 32;
	if (!(vmx_entry_ctrl & VM_ENTRY_LOAD_IA32_PAT) ||
	    !(vmx_entry_ctrl & VM_ENTRY_LOAD_IA32_EFER) ||
	    !(vmx_exit_ctrl & VM_EXIT_SAVE_IA32_PAT) ||
	    !(vmx_exit_ctrl & VM_EXIT_LOAD_IA32_PAT) ||
	    !(vmx_exit_ctrl & VM_EXIT_SAVE_IA32_EFER) ||
	    !(vmx_exit_ctrl & VM_EXIT_LOAD_IA32_EFER))
		return trace_error(-EIO);

	/* require activity state HLT */
	if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
		return trace_error(-EIO);

	/*
	 * Retrieve/validate restrictions on CR0
	 *
	 * In addition to what the VMX MSRs tell us, make sure that
	 * - NW and CD are kept off as they are not updated on VM exit and we
	 *   don't want them enabled for performance reasons while in root mode
	 * - PE and PG can be freely chosen (by the guest) because we demand
	 *   unrestricted guest mode support anyway
	 * - ET is always on (architectural requirement)
	 */
	maybe1 = read_msr(MSR_IA32_VMX_CR0_FIXED1) &
		~(X86_CR0_NW | X86_CR0_CD);
	required1 = (read_msr(MSR_IA32_VMX_CR0_FIXED0) &
		~(X86_CR0_PE | X86_CR0_PG)) | X86_CR0_ET;
	if (!vmx_define_cr_restrictions(CR0_IDX, maybe1, required1))
		return trace_error(-EIO);

	/* Retrieve/validate restrictions on CR4 */
	maybe1 = read_msr(MSR_IA32_VMX_CR4_FIXED1);
	required1 = read_msr(MSR_IA32_VMX_CR4_FIXED0);
	if (!vmx_define_cr_restrictions(CR4_IDX, maybe1, required1))
		return trace_error(-EIO);

	return 0;
}

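/*
 * Non-leaf EPT entries carry full read/write/execute permissions; access
 * control is applied at the leaf (page) level only.
 */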
static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
	*pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
		EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
}

int vcpu_vendor_init(void)
{
	unsigned int n;
	int err;

	err = vmx_check_features();
	if (err)
		return err;

	/* derive ept_paging from very similar x86_64_paging */
	memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
	for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
		ept_paging[n].set_next_pt = ept_set_next_pt;
	if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
		ept_paging[1].page_size = 0;
	if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
		ept_paging[2].page_size = 0;

	if (using_x2apic) {
		/* allow direct x2APIC access except for ICR writes */
		memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
		       0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
		memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
		       0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
		msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;
	}

	return vcpu_cell_init(&root_cell);
}

unsigned long arch_paging_gphys2phys(struct per_cpu *cpu_data,
				     unsigned long gphys, unsigned long flags)
{
	return paging_virt2phys(&cpu_data->cell->vmx.ept_structs, gphys,
				flags);
}

int vcpu_vendor_cell_init(struct cell *cell)
{
	int err;

	/* allocate io_bitmap */
	cell->vmx.io_bitmap = page_alloc(&mem_pool, 2);
	if (!cell->vmx.io_bitmap)
		return -ENOMEM;

	/* build root EPT of cell */
	cell->vmx.ept_structs.root_paging = ept_paging;
	cell->vmx.ept_structs.root_table = (page_table_t)cell->root_table_page;

	err = paging_create(&cell->vmx.ept_structs,
			    paging_hvirt2phys(apic_access_page),
			    PAGE_SIZE, XAPIC_BASE,
			    EPT_FLAG_READ | EPT_FLAG_WRITE | EPT_FLAG_WB_TYPE,
			    PAGING_NON_COHERENT);
	if (err)
		goto err_free_io_bitmap;

	return 0;

err_free_io_bitmap:
	page_free(&mem_pool, cell->vmx.io_bitmap, 2);

	return err;
}

int vcpu_map_memory_region(struct cell *cell,
			   const struct jailhouse_memory *mem)
{
	u64 phys_start = mem->phys_start;
	u32 flags = EPT_FLAG_WB_TYPE;

	if (mem->flags & JAILHOUSE_MEM_READ)
		flags |= EPT_FLAG_READ;
	if (mem->flags & JAILHOUSE_MEM_WRITE)
		flags |= EPT_FLAG_WRITE;
	if (mem->flags & JAILHOUSE_MEM_EXECUTE)
		flags |= EPT_FLAG_EXECUTE;
	if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
		phys_start = paging_hvirt2phys(&cell->comm_page);

	return paging_create(&cell->vmx.ept_structs, phys_start, mem->size,
			     mem->virt_start, flags, PAGING_NON_COHERENT);
}

int vcpu_unmap_memory_region(struct cell *cell,
			     const struct jailhouse_memory *mem)
{
	return paging_destroy(&cell->vmx.ept_structs, mem->virt_start,
			      mem->size, PAGING_NON_COHERENT);
}

void vcpu_vendor_cell_exit(struct cell *cell)
{
	paging_destroy(&cell->vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
		       PAGING_NON_COHERENT);
	page_free(&mem_pool, cell->vmx.io_bitmap, 2);
}

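/*
 * Invalidate cached EPT translations after a cell's mappings changed: a
 * single-context INVEPT against this cell's EPTP when supported, otherwise
 * a global invalidation.
 */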
void vcpu_tlb_flush(void)
{
	unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
	struct {
		u64 eptp;
		u64 reserved;
	} descriptor;
	u64 type;
	u8 ok;

	descriptor.reserved = 0;
	if (ept_cap & EPT_INVEPT_SINGLE) {
		type = VMX_INVEPT_SINGLE;
		descriptor.eptp = vmcs_read64(EPT_POINTER);
	} else {
		type = VMX_INVEPT_GLOBAL;
		descriptor.eptp = 0;
	}
	asm volatile(
		"invept (%1),%2\n\t"
		"seta %0"
		: "=qm" (ok)
		: "r" (&descriptor), "r" (type)
		: "memory", "cc");

	if (!ok) {
		panic_printk("FATAL: invept failed, error %d\n",
			     vmcs_read32(VM_INSTRUCTION_ERROR));
		panic_stop();
	}
}

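/*
 * Program a guest control register: bits owned by the hypervisor (all
 * required-1 bits plus everything the hardware does not allow to be 1) are
 * covered by the guest/host mask so that guest writes to them trap, while
 * the read shadow returns the guest's own view of the register.
 */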
static bool vmx_set_guest_cr(unsigned int cr_idx, unsigned long val)
{
	bool ok = true;

	if (cr_idx)
		val |= X86_CR4_VMXE; /* keeps the hypervisor visible */

	ok &= vmcs_write64(cr_idx ? GUEST_CR4 : GUEST_CR0,
			   (val & cr_maybe1[cr_idx]) | cr_required1[cr_idx]);
	ok &= vmcs_write64(cr_idx ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
	ok &= vmcs_write64(cr_idx ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
			   cr_required1[cr_idx] | ~cr_maybe1[cr_idx]);

	return ok;
}

static bool vmx_set_cell_config(void)
{
	struct cell *cell = this_cell();
	u8 *io_bitmap;
	bool ok = true;

	io_bitmap = cell->vmx.io_bitmap;
	ok &= vmcs_write64(IO_BITMAP_A, paging_hvirt2phys(io_bitmap));
	ok &= vmcs_write64(IO_BITMAP_B,
			   paging_hvirt2phys(io_bitmap + PAGE_SIZE));

	ok &= vmcs_write64(EPT_POINTER,
			paging_hvirt2phys(cell->vmx.ept_structs.root_table) |
			EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);

	return ok;
}

static bool vmx_set_guest_segment(const struct segment *seg,
				  unsigned long selector_field)
{
	bool ok = true;

	ok &= vmcs_write16(selector_field, seg->selector);
	ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
	ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
	ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
			   seg->access_rights);

	return ok;
}

static bool vmcs_setup(struct per_cpu *cpu_data)
{
	struct desc_table_reg dtr;
	unsigned long val;
	bool ok = true;

	ok &= vmcs_write64(HOST_CR0, read_cr0());
	ok &= vmcs_write64(HOST_CR3, read_cr3());
	ok &= vmcs_write64(HOST_CR4, read_cr4());

	ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
	ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
	ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
	ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);

	ok &= vmcs_write64(HOST_FS_BASE, 0);
	ok &= vmcs_write64(HOST_GS_BASE, read_msr(MSR_GS_BASE));
	ok &= vmcs_write64(HOST_TR_BASE, 0);

	read_gdtr(&dtr);
	ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
	read_idtr(&dtr);
	ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);

	ok &= vmcs_write64(HOST_IA32_PAT, read_msr(MSR_IA32_PAT));
	ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);

	ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
	ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
	ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);

	ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
			   sizeof(cpu_data->stack));
	ok &= vmcs_write64(HOST_RIP, (unsigned long)vmx_vmexit);

	ok &= vmx_set_guest_cr(CR0_IDX, cpu_data->linux_cr0);
	ok &= vmx_set_guest_cr(CR4_IDX, cpu_data->linux_cr4);

	ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);

	ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
	ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
	ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
	ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
	ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
	ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
	ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
	ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);

	ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
	ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
	ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
	ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);

	ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
	ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
			   (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
	ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);

	ok &= vmcs_write32(GUEST_SYSENTER_CS,
			   read_msr(MSR_IA32_SYSENTER_CS));
	ok &= vmcs_write64(GUEST_SYSENTER_EIP,
			   read_msr(MSR_IA32_SYSENTER_EIP));
	ok &= vmcs_write64(GUEST_SYSENTER_ESP,
			   read_msr(MSR_IA32_SYSENTER_ESP));

	ok &= vmcs_write64(GUEST_DR7, 0x00000400);
	ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

	ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);

	ok &= vmcs_write64(GUEST_IA32_PAT, cpu_data->pat);
	ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);

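	/* no shadow VMCS in use, so the link pointer stays at the required
	 * all-ones value */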
	ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
	ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);

	val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
	val |= PIN_BASED_NMI_EXITING;
	ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);

	ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);

	val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
	val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
	val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
	ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);

	ok &= vmcs_write64(MSR_BITMAP, paging_hvirt2phys(msr_bitmap));

	val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
	val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
		SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST |
		enable_rdtscp;
	ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);

	ok &= vmcs_write64(APIC_ACCESS_ADDR,
			   paging_hvirt2phys(apic_access_page));

	ok &= vmx_set_cell_config();

	ok &= vmcs_write32(EXCEPTION_BITMAP, 0);

	val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
	val |= VM_EXIT_HOST_ADDR_SPACE_SIZE |
		VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
		VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER;
	ok &= vmcs_write32(VM_EXIT_CONTROLS, val);

	ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
	ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
	ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

	val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
	val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_PAT |
		VM_ENTRY_LOAD_IA32_EFER;
	ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

	ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);

	ok &= vmcs_write32(CR3_TARGET_COUNT, 0);

	return ok;
}

int vcpu_init(struct per_cpu *cpu_data)
{
	unsigned long feature_ctrl, mask;
	u32 revision_id;
	int err;

	/* make sure all perf counters are off */
	if ((cpuid_eax(0x0a) & 0xff) > 0)
		write_msr(MSR_IA32_PERF_GLOBAL_CTRL, 0);

	if (cpu_data->linux_cr4 & X86_CR4_VMXE)
		return trace_error(-EBUSY);

	err = vmx_check_features();
	if (err)
		return err;

	revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
	cpu_data->vmxon_region.revision_id = revision_id;
	cpu_data->vmxon_region.shadow_indicator = 0;
	cpu_data->vmcs.revision_id = revision_id;
	cpu_data->vmcs.shadow_indicator = 0;

	/* Note: We assume that TXT is off */
	feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
	mask = FEATURE_CONTROL_LOCKED |
		FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

	if ((feature_ctrl & mask) != mask) {
		if (feature_ctrl & FEATURE_CONTROL_LOCKED)
			return trace_error(-ENODEV);

		feature_ctrl |= mask;
		write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
	}

	/*
	 * SDM Volume 3, 2.5: "When loading a control register, reserved bits
	 * should always be set to the values previously read."
	 * But we want to avoid surprises with new features unknown to us but
	 * set by Linux. So check if any assumed reserved bit was set or should
	 * be set for VMX operation and bail out if so.
	 */
	if ((cpu_data->linux_cr0 | cr_required1[CR0_IDX]) & X86_CR0_RESERVED ||
	    (cpu_data->linux_cr4 | cr_required1[CR4_IDX]) & X86_CR4_RESERVED)
		return trace_error(-EIO);

	/*
	 * Bring CR0 and CR4 into well-defined states. If they do not match
	 * with VMX requirements, vmxon will fail.
	 * X86_CR4_OSXSAVE is enabled if available so that xsetbv can be
	 * executed on behalf of a cell.
	 */
	write_cr0(X86_CR0_HOST_STATE);
	write_cr4(X86_CR4_HOST_STATE | X86_CR4_VMXE |
		  ((cpuid_ecx(1) & X86_FEATURE_XSAVE) ? X86_CR4_OSXSAVE : 0));

	if (!vmxon(cpu_data)) {
		write_cr4(cpu_data->linux_cr4);
		return trace_error(-EIO);
	}

	cpu_data->vmx_state = VMXON;

	if (!vmcs_clear(cpu_data) ||
	    !vmcs_load(cpu_data) ||
	    !vmcs_setup(cpu_data))
		return trace_error(-EIO);

	cpu_data->vmx_state = VMCS_READY;

	return 0;
}

void vcpu_exit(struct per_cpu *cpu_data)
{
	if (cpu_data->vmx_state == VMXOFF)
		return;

	cpu_data->vmx_state = VMXOFF;
	/* Write vmx_state to ensure that vcpu_nmi_handler stops accessing
	 * the VMCS (a compiler barrier would be sufficient, in fact). */
	memory_barrier();

	vmcs_clear(cpu_data);
	asm volatile("vmxoff" : : : "cc");
	cpu_data->linux_cr4 &= ~X86_CR4_VMXE;
}

void __attribute__((noreturn)) vcpu_activate_vmm(struct per_cpu *cpu_data)
{
	/* We enter Linux at the point arch_entry would return to as well.
	 * rax is cleared to signal success to the caller. */
	asm volatile(
		"mov (%%rdi),%%r15\n\t"
		"mov 0x8(%%rdi),%%r14\n\t"
		"mov 0x10(%%rdi),%%r13\n\t"
		"mov 0x18(%%rdi),%%r12\n\t"
		"mov 0x20(%%rdi),%%rbx\n\t"
		"mov 0x28(%%rdi),%%rbp\n\t"
		"vmlaunch\n\t"
		"pop %%rbp"
		: /* no output */
		: "a" (0), "D" (cpu_data->linux_reg)
		: "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");

	panic_printk("FATAL: vmlaunch failed, error %d\n",
		     vmcs_read32(VM_INSTRUCTION_ERROR));
	panic_stop();
}

void __attribute__((noreturn)) vcpu_deactivate_vmm(void)
{
	unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
	unsigned long linux_ip = vmcs_read64(GUEST_RIP);
	struct per_cpu *cpu_data = this_cpu_data();

	cpu_data->linux_cr0 = vmcs_read64(GUEST_CR0);
	cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);
	cpu_data->linux_cr4 = vmcs_read64(GUEST_CR4);

	cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
	cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
	cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
	cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);

	cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);

	cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);

	cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
	cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
	cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);

	write_msr(MSR_IA32_SYSENTER_CS, vmcs_read32(GUEST_SYSENTER_CS));
	write_msr(MSR_IA32_SYSENTER_EIP, vmcs_read64(GUEST_SYSENTER_EIP));
	write_msr(MSR_IA32_SYSENTER_ESP, vmcs_read64(GUEST_SYSENTER_ESP));

	cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
	cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
	cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
	cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);

	arch_cpu_restore(cpu_data, 0);

	stack--;
	*stack = linux_ip;

	asm volatile (
		"mov %%rbx,%%rsp\n\t"
		"pop %%r15\n\t"
		"pop %%r14\n\t"
		"pop %%r13\n\t"
		"pop %%r12\n\t"
		"pop %%r11\n\t"
		"pop %%r10\n\t"
		"pop %%r9\n\t"
		"pop %%r8\n\t"
		"pop %%rdi\n\t"
		"pop %%rsi\n\t"
		"pop %%rbp\n\t"
		"add $8,%%rsp\n\t"
		"pop %%rbx\n\t"
		"pop %%rdx\n\t"
		"pop %%rcx\n\t"
		"mov %%rax,%%rsp\n\t"
		"xor %%rax,%%rax\n\t"
		"ret"
		: : "a" (stack), "b" (&cpu_data->guest_regs));
	__builtin_unreachable();
}

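/*
 * Emulate the architectural INIT/SIPI state: 16-bit real mode with 64K
 * segment limits, starting either at the real-mode reset vector
 * (0xf000:fff0, for the pseudo-SIPI sent to the BSP) or at the start-up
 * address derived from the SIPI vector.
 */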
static void vmx_vcpu_reset(unsigned int sipi_vector)
{
	unsigned long val;
	bool ok = true;

	ok &= vmx_set_guest_cr(CR0_IDX, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
	ok &= vmx_set_guest_cr(CR4_IDX, 0);

	ok &= vmcs_write64(GUEST_CR3, 0);

	ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
	ok &= vmcs_write64(GUEST_RSP, 0);

	val = 0;
	if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
		val = 0xfff0;
		sipi_vector = 0xf0;

		/* only cleared on hard reset */
		ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
	}
	ok &= vmcs_write64(GUEST_RIP, val);

	ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
	ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
	ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);

	ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_DS_BASE, 0);
	ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_ES_BASE, 0);
	ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_FS_BASE, 0);
	ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_GS_BASE, 0);
	ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_SS_BASE, 0);
	ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_TR_BASE, 0);
	ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);

	ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
	ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	ok &= vmcs_write64(GUEST_IA32_EFER, 0);

	ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
	ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
	ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);

	ok &= vmcs_write64(GUEST_DR7, 0x00000400);

	ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);

	val = vmcs_read32(VM_ENTRY_CONTROLS);
	val &= ~VM_ENTRY_IA32E_MODE;
	ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

	ok &= vmx_set_cell_config();

	if (!ok) {
		panic_printk("FATAL: CPU reset failed\n");
		panic_stop();
	}
}

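/*
 * Runs in NMI context, where it is not safe to process events directly:
 * instead, arm the zero-value preemption timer (see vmcs_setup) so that the
 * CPU exits to the hypervisor right after the guest resumes.
 */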
void vcpu_nmi_handler(void)
{
	u32 pin_based_ctrl;

	if (this_cpu_data()->vmx_state != VMCS_READY)
		return;

	pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
	pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}

void vcpu_park(void)
{
	vmx_vcpu_reset(0);
	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
}

static void vmx_disable_preemption_timer(void)
{
	u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);

	pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
}

void vcpu_skip_emulated_instruction(unsigned int inst_len)
{
	vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
}

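/*
 * The guest enabled paging while EFER.LME was set: perform the EFER.LMA
 * transition the CPU would do architecturally and mirror it in the VM-entry
 * controls, which carry the IA-32e mode state under VMX.
 */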
static void update_efer(void)
{
	unsigned long efer = vmcs_read64(GUEST_IA32_EFER);

	if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
		return;

	efer |= EFER_LMA;
	vmcs_write64(GUEST_IA32_EFER, efer);
	vmcs_write32(VM_ENTRY_CONTROLS,
		     vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
}

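/*
 * Decode the CR-access exit qualification (SDM Vol. 3, 27.2.1): bits 3:0
 * hold the control register number, bits 5:4 the access type (0 = MOV to
 * CR) and bits 11:8 the general-purpose register involved.
 */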
static bool vmx_handle_cr(void)
{
	u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
	unsigned long cr, reg, val;

	cr = exit_qualification & 0xf;
	reg = (exit_qualification >> 8) & 0xf;

	switch ((exit_qualification >> 4) & 3) {
	case 0: /* move to cr */
		if (reg == 4)
			val = vmcs_read64(GUEST_RSP);
		else
			val = this_cpu_data()->guest_regs.by_index[15 - reg];

		if (cr == 0 || cr == 4) {
			vcpu_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
			/* TODO: check for #GP reasons */
			vmx_set_guest_cr(cr ? CR4_IDX : CR0_IDX, val);
			if (cr == 0 && val & X86_CR0_PG)
				update_efer();
			return true;
		}
		break;
	default:
		break;
	}
	panic_printk("FATAL: Unhandled CR access, qualification %x\n",
		     exit_qualification);
	return false;
}

bool vcpu_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
{
	if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
		pg_structs->root_paging = x86_64_paging;
		pg_structs->root_table_gphys =
			vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
	} else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
		   !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
		pg_structs->root_paging = i386_paging;
		pg_structs->root_table_gphys =
			vmcs_read64(GUEST_CR3) & 0xfffff000UL;
	} else {
		printk("FATAL: Unsupported paging mode\n");
		return false;
	}
	return true;
}

void vcpu_vendor_set_guest_pat(unsigned long val)
{
	vmcs_write64(GUEST_IA32_PAT, val);
}

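/*
 * xAPIC registers are 16-byte aligned, so an access is only accepted if it
 * hits the start of a register; the byte offset is then converted into a
 * register index (offset >> 4) for the APIC emulation layer.
 */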
static bool vmx_handle_apic_access(void)
{
	struct guest_paging_structures pg_structs;
	unsigned int inst_len, offset;
	u64 qualification;
	bool is_write;

	qualification = vmcs_read64(EXIT_QUALIFICATION);

	switch (qualification & APIC_ACCESS_TYPE_MASK) {
	case APIC_ACCESS_TYPE_LINEAR_READ:
	case APIC_ACCESS_TYPE_LINEAR_WRITE:
		is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
		offset = qualification & APIC_ACCESS_OFFSET_MASK;
		if (offset & 0x00f)
			break;

		if (!vcpu_get_guest_paging_structs(&pg_structs))
			break;

		inst_len = apic_mmio_access(vmcs_read64(GUEST_RIP),
					    &pg_structs, offset >> 4,
					    is_write);
		if (!inst_len)
			break;

		vcpu_skip_emulated_instruction(inst_len);
		return true;
	}
	panic_printk("FATAL: Unhandled APIC access, "
		     "qualification %x\n", qualification);
	return false;
}

static void dump_vm_exit_details(u32 reason)
{
	panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
	panic_printk("vectoring info: %x interrupt info: %x\n",
		     vmcs_read32(IDT_VECTORING_INFO_FIELD),
		     vmcs_read32(VM_EXIT_INTR_INFO));
	if (reason == EXIT_REASON_EPT_VIOLATION ||
	    reason == EXIT_REASON_EPT_MISCONFIG)
		panic_printk("guest phys addr %p guest linear addr: %p\n",
			     vmcs_read64(GUEST_PHYSICAL_ADDRESS),
			     vmcs_read64(GUEST_LINEAR_ADDRESS));
}

static void dump_guest_regs(union registers *guest_regs)
{
	panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
		     vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
	panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
		     guest_regs->rbx, guest_regs->rcx);
	panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
		     guest_regs->rsi, guest_regs->rdi);
	panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
		     vmcs_read64(GUEST_CS_SELECTOR),
		     vmcs_read64(GUEST_CS_BASE),
		     vmcs_read32(GUEST_CS_AR_BYTES),
		     !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
	panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
		     vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
	panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
}

void vcpu_vendor_get_io_intercept(struct vcpu_io_intercept *io)
{
	u64 exitq = vmcs_read64(EXIT_QUALIFICATION);

	/* parse exit qualification for I/O instructions (see SDM, 27.2.1) */
	io->port = (exitq >> 16) & 0xffff;
	io->size = (exitq & 0x3) + 1;
	io->in = !!((exitq & 0x8) >> 3);
	io->inst_len = vmcs_read64(VM_EXIT_INSTRUCTION_LEN);
	io->rep_or_str = !!(exitq & 0x30);
}

void vcpu_vendor_get_mmio_intercept(struct vcpu_mmio_intercept *mmio)
{
	u64 exitq = vmcs_read64(EXIT_QUALIFICATION);

	mmio->phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
	/* We don't enable dirty/accessed bit updates in EPTP, so only the
	 * read or the write flag can be set, not both. */
	mmio->is_write = !!(exitq & 0x2);
}

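/*
 * Central VM-exit dispatcher: returning from this function resumes the
 * guest via vmresume, while unhandled exits dump the exit details and guest
 * registers and then park the CPU.
 */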
void vcpu_handle_exit(struct per_cpu *cpu_data)
{
	u32 reason = vmcs_read32(VM_EXIT_REASON);
	int sipi_vector;

	cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;

	switch (reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		asm volatile("int %0" : : "i" (NMI_VECTOR));
		/* fall through */
	case EXIT_REASON_PREEMPTION_TIMER:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
		vmx_disable_preemption_timer();
		sipi_vector = x86_handle_events(cpu_data);
		if (sipi_vector >= 0) {
			printk("CPU %d received SIPI, vector %x\n",
			       cpu_data->cpu_id, sipi_vector);
			vmx_vcpu_reset(sipi_vector);
			vcpu_reset(sipi_vector == APIC_BSP_PSEUDO_SIPI);
		}
		iommu_check_pending_faults();
		return;
	case EXIT_REASON_CPUID:
		vcpu_handle_cpuid();
		return;
	case EXIT_REASON_VMCALL:
		vcpu_handle_hypercall();
		return;
	case EXIT_REASON_CR_ACCESS:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
		if (vmx_handle_cr())
			return;
		break;
	case EXIT_REASON_MSR_READ:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
		if (vcpu_handle_msr_read())
			return;
		break;
	case EXIT_REASON_MSR_WRITE:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
		if (cpu_data->guest_regs.rcx == MSR_IA32_PERF_GLOBAL_CTRL) {
			/* ignore writes */
			vcpu_skip_emulated_instruction(X86_INST_LEN_WRMSR);
			return;
		} else if (vcpu_handle_msr_write())
			return;
		break;
	case EXIT_REASON_APIC_ACCESS:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
		if (vmx_handle_apic_access())
			return;
		break;
	case EXIT_REASON_XSETBV:
		if (vcpu_handle_xsetbv())
			return;
		break;
	case EXIT_REASON_IO_INSTRUCTION:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
		if (vcpu_handle_io_access())
			return;
		break;
	case EXIT_REASON_EPT_VIOLATION:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
		if (vcpu_handle_mmio_access())
			return;
		break;
	default:
		panic_printk("FATAL: %s, reason %d\n",
			     (reason & EXIT_REASONS_FAILED_VMENTRY) ?
			     "VM-Entry failure" : "Unhandled VM-Exit",
			     (u16)reason);
		dump_vm_exit_details(reason);
		break;
	}
	dump_guest_regs(&cpu_data->guest_regs);
	panic_park();
}

void vmx_entry_failure(void)
{
	panic_printk("FATAL: vmresume failed, error %d\n",
		     vmcs_read32(VM_INSTRUCTION_ERROR));
	panic_stop();
}

void vcpu_vendor_get_cell_io_bitmap(struct cell *cell,
				    struct vcpu_io_bitmap *iobm)
{
	iobm->data = cell->vmx.io_bitmap;
	/* io_bitmap is a pointer to two allocated pages, so report the
	 * mapped size rather than sizeof() the pointer */
	iobm->size = PAGE_SIZE * 2;
}

void vcpu_vendor_get_execution_state(struct vcpu_execution_state *x_state)
{
	x_state->efer = vmcs_read64(GUEST_IA32_EFER);
	x_state->rflags = vmcs_read64(GUEST_RFLAGS);
	x_state->cs = vmcs_read16(GUEST_CS_SELECTOR);
	x_state->rip = vmcs_read64(GUEST_RIP);
}

void enable_irq(void)
{
	asm volatile("sti" : : : "memory");
}

void disable_irq(void)
{
	asm volatile("cli" : : : "memory");
}