]> rtime.felk.cvut.cz Git - jailhouse.git/blob - hypervisor/arch/x86/vmx.c
x86: Use more BIT_MASK macro for paging tasks
[jailhouse.git] / hypervisor / arch / x86 / vmx.c
1 /*
2  * Jailhouse, a Linux-based partitioning hypervisor
3  *
4  * Copyright (c) Siemens AG, 2013-2016
5  * Copyright (c) Valentine Sinitsyn, 2014
6  *
7  * Authors:
8  *  Jan Kiszka <jan.kiszka@siemens.com>
9  *  Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2.  See
12  * the COPYING file in the top-level directory.
13  */
14
15 #include <jailhouse/entry.h>
16 #include <jailhouse/paging.h>
17 #include <jailhouse/processor.h>
18 #include <jailhouse/printk.h>
19 #include <jailhouse/string.h>
20 #include <jailhouse/control.h>
21 #include <jailhouse/hypercall.h>
22 #include <asm/apic.h>
23 #include <asm/control.h>
24 #include <asm/iommu.h>
25 #include <asm/vcpu.h>
26 #include <asm/vmx.h>
27
/* Indices into cr_maybe1[]/cr_required1[] below */
#define CR0_IDX                 0
#define CR4_IDX                 1

/* The VMX I/O bitmap spans ports 0x0000-0xffff -> 2 pages (A + B) */
#define PIO_BITMAP_PAGES        2

/*
 * Segment marked "unusable" (bit 16 of the VMCS access-rights field set);
 * used to park SS and LDTR in vmcs_setup.
 */
static const struct segment invalid_seg = {
        .access_rights = 0x10000
};
36
/*
 * VMX MSR bitmap: one bit per MSR, bit set = access intercepted (VM exit),
 * bit cleared = direct access allowed for the guest.
 * Four 1K sub-bitmaps selected by the VMX_MSR_BMP_* indices — presumably
 * matching the hardware quadrant layout (read/write x low/high MSR range);
 * confirm against asm/vmx.h. Page alignment is a hardware requirement for
 * the address written to the MSR_BITMAP VMCS field.
 */
/* bit cleared: direct access allowed */
// TODO: convert to whitelist
static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
        [ VMX_MSR_BMP_0000_READ ] = {
                [      0/8 ...  0x26f/8 ] = 0,
                [  0x270/8 ...  0x277/8 ] = 0x80, /* 0x277 */
                [  0x278/8 ...  0x2f7/8 ] = 0,
                [  0x2f8/8 ...  0x2ff/8 ] = 0x80, /* 0x2ff */
                [  0x300/8 ...  0x7ff/8 ] = 0,
                [  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
                [  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
                [  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
                [  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
                [  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
                [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
                [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
                [  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
                [  0x840/8 ... 0x1fff/8 ] = 0,
        },
        [ VMX_MSR_BMP_C000_READ ] = {
                [      0/8 ... 0x1fff/8 ] = 0,
        },
        [ VMX_MSR_BMP_0000_WRITE ] = {
                [      0/8 ...   0x17/8 ] = 0,
                [   0x18/8 ...   0x1f/8 ] = 0x08, /* 0x01b */
                [   0x20/8 ...  0x1ff/8 ] = 0,
                [  0x200/8 ...  0x277/8 ] = 0xff, /* 0x200 - 0x277 */
                [  0x278/8 ...  0x2f7/8 ] = 0,
                [  0x2f8/8 ...  0x2ff/8 ] = 0x80, /* 0x2ff */
                [  0x300/8 ...  0x387/8 ] = 0,
                [  0x388/8 ...  0x38f/8 ] = 0x80, /* 0x38f */
                [  0x390/8 ...  0x7ff/8 ] = 0,
                [  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
                [  0x810/8 ...  0x827/8 ] = 0,
                [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
                [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
                [  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
                [  0x840/8 ...  0xd8f/8 ] = 0xff, /* esp. 0xc80 - 0xd8f */
                [  0xd90/8 ... 0x1fff/8 ] = 0,
        },
        [ VMX_MSR_BMP_C000_WRITE ] = {
                [      0/8 ... 0x1fff/8 ] = 0,
        },
};
/* Dummy page mapped at XAPIC_BASE of every cell for APIC virtualization */
static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
/* EPT paging structure descriptors, derived from x86_64_paging at init */
static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];
/* SECONDARY_EXEC_RDTSCP if the CPU advertises RDTSCP, else 0 */
static u32 enable_rdtscp;
/* Allowed-1 and required-1 bits of CR0/CR4, indexed by CR0_IDX/CR4_IDX */
static unsigned long cr_maybe1[2], cr_required1[2];
85
/*
 * Enter VMX root operation using the per-CPU VMXON region.
 * Returns true on success: seta reports CF=0 && ZF=0, i.e. neither
 * VMfailInvalid nor VMfailValid.
 */
static bool vmxon(struct per_cpu *cpu_data)
{
        unsigned long vmxon_addr;
        u8 ok;

        vmxon_addr = paging_hvirt2phys(&cpu_data->vmxon_region);
        asm volatile(
                "vmxon (%1)\n\t"
                "seta %0"
                : "=rm" (ok)
                : "r" (&vmxon_addr), "m" (vmxon_addr)
                : "memory", "cc");
        return ok;
}
100
/*
 * VMCLEAR the per-CPU VMCS: flush its cached state to memory and mark it
 * inactive/not-current. Returns true on success (CF=0 && ZF=0).
 */
static bool vmcs_clear(struct per_cpu *cpu_data)
{
        unsigned long vmcs_addr = paging_hvirt2phys(&cpu_data->vmcs);
        u8 ok;

        asm volatile(
                "vmclear (%1)\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (&vmcs_addr), "m" (vmcs_addr)
                : "memory", "cc");
        return ok;
}
114
/*
 * VMPTRLD the per-CPU VMCS, making it current and active on this CPU so
 * subsequent vmread/vmwrite operate on it. Returns true on success.
 */
static bool vmcs_load(struct per_cpu *cpu_data)
{
        unsigned long vmcs_addr = paging_hvirt2phys(&cpu_data->vmcs);
        u8 ok;

        asm volatile(
                "vmptrld (%1)\n\t"
                "seta %0"
                : "=qm" (ok)
                : "r" (&vmcs_addr), "m" (vmcs_addr)
                : "memory", "cc");
        return ok;
}
128
/*
 * Read a field of the current VMCS. No error checking: vmread failure
 * leaves 'value' undefined, callers rely on the field being valid.
 */
static inline unsigned long vmcs_read64(unsigned long field)
{
        unsigned long value;

        asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
        return value;
}

/* Narrowing wrapper for 16-bit VMCS fields */
static inline u16 vmcs_read16(unsigned long field)
{
        return vmcs_read64(field);
}

/* Narrowing wrapper for 32-bit VMCS fields */
static inline u32 vmcs_read32(unsigned long field)
{
        return vmcs_read64(field);
}
146
/*
 * Write a field of the current VMCS, logging a fatal message on failure.
 * Returns true on success.
 *
 * NOTE(review): setnz only catches VMfailValid (ZF=1). VMfailInvalid
 * (CF=1, ZF=0) would go unnoticed — presumably acceptable because a
 * current VMCS is always loaded when this is called; confirm.
 */
static bool vmcs_write64(unsigned long field, unsigned long val)
{
        u8 ok;

        asm volatile(
                "vmwrite %1,%2\n\t"
                "setnz %0"
                : "=qm" (ok)
                : "r" (val), "r" (field)
                : "cc");
        if (!ok)
                printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
                       field, vmcs_read32(VM_INSTRUCTION_ERROR),
                       __builtin_return_address(0));
        return ok;
}

/* Widening wrapper for 16-bit VMCS fields */
static bool vmcs_write16(unsigned long field, u16 value)
{
        return vmcs_write64(field, value);
}

/* Widening wrapper for 32-bit VMCS fields */
static bool vmcs_write32(unsigned long field, u32 value)
{
        return vmcs_write64(field, value);
}
173
174 static bool vmx_define_cr_restrictions(unsigned int cr_idx,
175                                        unsigned long maybe1,
176                                        unsigned long required1)
177 {
178         if (!cr_maybe1[cr_idx]) {
179                 cr_maybe1[cr_idx] = maybe1;
180                 cr_required1[cr_idx] = required1;
181                 return true;
182         }
183
184         return cr_maybe1[cr_idx] == maybe1 &&
185                 cr_required1[cr_idx] == required1;
186 }
187
/*
 * Verify that this CPU provides all VMX features Jailhouse depends on and
 * derive the CR0/CR4 restrictions from the capability MSRs.
 *
 * Returns 0 on success, -ENODEV if VMX is absent, -EIO if a required
 * feature is missing or the CR restrictions differ between CPUs.
 */
static int vmx_check_features(void)
{
        unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
        unsigned long vmx_pin_ctrl, vmx_basic, maybe1, required1;
        unsigned long vmx_entry_ctrl, vmx_exit_ctrl;

        if (!(cpuid_ecx(1, 0) & X86_FEATURE_VMX))
                return trace_error(-ENODEV);

        vmx_basic = read_msr(MSR_IA32_VMX_BASIC);

        /* require VMCS size <= PAGE_SIZE,
         * VMCS memory access type == write back and
         * availability of TRUE_*_CTLS */
        if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
            ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
            !(vmx_basic & (1UL << 55)))
                return trace_error(-EIO);

        /* require NMI exiting and preemption timer support */
        /* the high 32 bits of the CTLS MSRs are the allowed-1 settings */
        vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
        if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
            !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
                return trace_error(-EIO);

        /* require I/O and MSR bitmap as well as secondary controls support */
        vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
        if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
            !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
            !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
                return trace_error(-EIO);

        /* require disabling of CR3 access interception */
        /* low 32 bits of the TRUE MSR: bits that must be 1 (required-1) */
        vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
        if (vmx_proc_ctrl &
            (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
                return trace_error(-EIO);

        /* require APIC access, EPT and unrestricted guest mode support */
        vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
        ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
        if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
            !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
            (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
            !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
            !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
                return trace_error(-EIO);

        /* require RDTSCP if present in CPUID */
        if (cpuid_edx(0x80000001, 0) & X86_FEATURE_RDTSCP) {
                enable_rdtscp = SECONDARY_EXEC_RDTSCP;
                if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_RDTSCP))
                        return trace_error(-EIO);
        }

        /* require PAT and EFER save/restore */
        vmx_entry_ctrl = read_msr(MSR_IA32_VMX_ENTRY_CTLS) >> 32;
        vmx_exit_ctrl = read_msr(MSR_IA32_VMX_EXIT_CTLS) >> 32;
        if (!(vmx_entry_ctrl & VM_ENTRY_LOAD_IA32_PAT) ||
            !(vmx_entry_ctrl & VM_ENTRY_LOAD_IA32_EFER) ||
            !(vmx_exit_ctrl & VM_EXIT_SAVE_IA32_PAT) ||
            !(vmx_exit_ctrl & VM_EXIT_LOAD_IA32_PAT) ||
            !(vmx_exit_ctrl & VM_EXIT_SAVE_IA32_EFER) ||
            !(vmx_exit_ctrl & VM_EXIT_LOAD_IA32_EFER))
                return trace_error(-EIO);

        /* require activity state HLT */
        if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
                return trace_error(-EIO);

        /*
         * Retrieve/validate restrictions on CR0
         *
         * In addition to what the VMX MSRs tell us, make sure that
         * - NW and CD are kept off as they are not updated on VM exit and we
         *   don't want them enabled for performance reasons while in root mode
         * - PE and PG can be freely chosen (by the guest) because we demand
         *   unrestricted guest mode support anyway
         * - ET is always on (architectural requirement)
         */
        maybe1 = read_msr(MSR_IA32_VMX_CR0_FIXED1) &
                ~(X86_CR0_NW | X86_CR0_CD);
        required1 = (read_msr(MSR_IA32_VMX_CR0_FIXED0) &
                ~(X86_CR0_PE | X86_CR0_PG)) | X86_CR0_ET;
        if (!vmx_define_cr_restrictions(CR0_IDX, maybe1, required1))
                return trace_error(-EIO);

        /* Retrieve/validate restrictions on CR4 */
        maybe1 = read_msr(MSR_IA32_VMX_CR4_FIXED1);
        required1 = read_msr(MSR_IA32_VMX_CR4_FIXED0);
        if (!vmx_define_cr_restrictions(CR4_IDX, maybe1, required1))
                return trace_error(-EIO);

        return 0;
}
283
/*
 * Link an EPT table entry to the next-level table: physical address
 * (bits 51:12) plus read/write/execute permissions.
 */
static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
        *pte = (next_pt & BIT_MASK(51, 12)) | EPT_FLAG_READ | EPT_FLAG_WRITE |
                EPT_FLAG_EXECUTE;
}
289
/*
 * One-time VMX backend initialization: validate features, build the EPT
 * paging descriptors, adjust the MSR bitmap for x2APIC mode and set up
 * the root cell.
 */
int vcpu_vendor_init(void)
{
        unsigned int n;
        int err;

        err = vmx_check_features();
        if (err)
                return err;

        /* derive ept_paging from very similar x86_64_paging */
        memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
        for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
                ept_paging[n].set_next_pt = ept_set_next_pt;
        /* disable huge-page levels the hardware does not support */
        if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
                ept_paging[1].page_size = 0;
        if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
                ept_paging[2].page_size = 0;

        if (using_x2apic) {
                /* allow direct x2APIC access except for ICR writes */
                memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
                       0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
                memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
                       0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
                /* re-enable interception of the ICR MSR (bit 0 of its byte) */
                msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;
        }

        return vcpu_cell_init(&root_cell);
}
319
/*
 * Translate a guest-physical address of the current cell to host-physical
 * by walking the cell's EPT; 'flags' are the required access permissions.
 */
unsigned long arch_paging_gphys2phys(struct per_cpu *cpu_data,
                                     unsigned long gphys, unsigned long flags)
{
        return paging_virt2phys(&cpu_data->cell->arch.vmx.ept_structs, gphys,
                                flags);
}
326
327 int vcpu_vendor_cell_init(struct cell *cell)
328 {
329         int err;
330
331         /* allocate io_bitmap */
332         cell->arch.vmx.io_bitmap = page_alloc(&mem_pool, PIO_BITMAP_PAGES);
333         if (!cell->arch.vmx.io_bitmap)
334                 return -ENOMEM;
335
336         /* build root EPT of cell */
337         cell->arch.vmx.ept_structs.root_paging = ept_paging;
338         cell->arch.vmx.ept_structs.root_table =
339                 (page_table_t)cell->arch.root_table_page;
340
341         err = paging_create(&cell->arch.vmx.ept_structs,
342                             paging_hvirt2phys(apic_access_page),
343                             PAGE_SIZE, XAPIC_BASE,
344                             EPT_FLAG_READ | EPT_FLAG_WRITE | EPT_FLAG_WB_TYPE,
345                             PAGING_NON_COHERENT);
346         if (err)
347                 goto err_free_io_bitmap;
348
349         return 0;
350
351 err_free_io_bitmap:
352         page_free(&mem_pool, cell->arch.vmx.io_bitmap, 2);
353
354         return err;
355 }
356
/*
 * Map a config-described memory region into the cell's EPT, translating
 * JAILHOUSE_MEM_* flags into EPT permissions. COMM_REGION regions are
 * backed by the cell's comm_page instead of mem->phys_start.
 */
int vcpu_map_memory_region(struct cell *cell,
                           const struct jailhouse_memory *mem)
{
        u64 phys_start = mem->phys_start;
        u32 flags = EPT_FLAG_WB_TYPE;

        if (mem->flags & JAILHOUSE_MEM_READ)
                flags |= EPT_FLAG_READ;
        if (mem->flags & JAILHOUSE_MEM_WRITE)
                flags |= EPT_FLAG_WRITE;
        if (mem->flags & JAILHOUSE_MEM_EXECUTE)
                flags |= EPT_FLAG_EXECUTE;
        if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
                phys_start = paging_hvirt2phys(&cell->comm_page);

        return paging_create(&cell->arch.vmx.ept_structs, phys_start, mem->size,
                             mem->virt_start, flags, PAGING_NON_COHERENT);
}

/* Remove a memory region from the cell's EPT. */
int vcpu_unmap_memory_region(struct cell *cell,
                             const struct jailhouse_memory *mem)
{
        return paging_destroy(&cell->arch.vmx.ept_structs, mem->virt_start,
                              mem->size, PAGING_NON_COHERENT);
}
382
383 void vcpu_vendor_cell_exit(struct cell *cell)
384 {
385         paging_destroy(&cell->arch.vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
386                        PAGING_NON_COHERENT);
387         page_free(&mem_pool, cell->arch.vmx.io_bitmap, 2);
388 }
389
/*
 * Invalidate EPT-derived TLB entries after an EPT change. Prefers a
 * single-context INVEPT on this cell's EPT pointer; falls back to a
 * global flush if the CPU only supports that. Panics on INVEPT failure.
 */
void vcpu_tlb_flush(void)
{
        unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
        /* INVEPT descriptor: EPTP + 64 reserved bits (must be zero) */
        struct {
                u64 eptp;
                u64 reserved;
        } descriptor;
        u64 type;
        u8 ok;

        descriptor.reserved = 0;
        if (ept_cap & EPT_INVEPT_SINGLE) {
                type = VMX_INVEPT_SINGLE;
                descriptor.eptp = vmcs_read64(EPT_POINTER);
        } else {
                type = VMX_INVEPT_GLOBAL;
                descriptor.eptp = 0;
        }
        asm volatile(
                "invept (%1),%2\n\t"
                "seta %0\n\t"
                : "=qm" (ok)
                : "r" (&descriptor), "r" (type)
                : "memory", "cc");

        if (!ok) {
                panic_printk("FATAL: invept failed, error %d\n",
                             vmcs_read32(VM_INSTRUCTION_ERROR));
                panic_stop();
        }
}
421
/*
 * Program guest CR0 or CR4 (cr_idx is CR0_IDX/CR4_IDX) from the value the
 * guest wants: the real register gets 'val' clamped to the hardware
 * restrictions, the read shadow reflects the unclamped value, and the
 * guest/host mask makes writes to restricted bits trap.
 */
static bool vmx_set_guest_cr(unsigned int cr_idx, unsigned long val)
{
        bool ok = true;

        if (cr_idx)
                val |= X86_CR4_VMXE; /* keeps the hypervisor visible */

        /* hardware register: force required-1 bits, strip disallowed ones */
        ok &= vmcs_write64(cr_idx ? GUEST_CR4 : GUEST_CR0,
                           (val & cr_maybe1[cr_idx]) | cr_required1[cr_idx]);
        /* the guest reads back what it wrote, not the clamped value */
        ok &= vmcs_write64(cr_idx ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
        /* host-owned bits: every bit that is fixed to 0 or 1 */
        ok &= vmcs_write64(cr_idx ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
                           cr_required1[cr_idx] | ~cr_maybe1[cr_idx]);

        return ok;
}
437
/*
 * Program the per-cell parts of the VMCS: the two I/O bitmap pages
 * (ports 0x0000-0x7fff and 0x8000-0xffff) and the EPT pointer of the
 * CPU's current cell.
 */
static bool vmx_set_cell_config(void)
{
        struct cell *cell = this_cell();
        u8 *io_bitmap;
        bool ok = true;

        io_bitmap = cell->arch.vmx.io_bitmap;
        ok &= vmcs_write64(IO_BITMAP_A, paging_hvirt2phys(io_bitmap));
        ok &= vmcs_write64(IO_BITMAP_B,
                           paging_hvirt2phys(io_bitmap + PAGE_SIZE));

        ok &= vmcs_write64(EPT_POINTER,
                paging_hvirt2phys(cell->arch.vmx.ept_structs.root_table) |
                EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);

        return ok;
}
455
/*
 * Write one guest segment (selector, base, limit, access rights) to the
 * VMCS. 'selector_field' is the segment's selector field encoding; the
 * related fields are reached via the GUEST_SEG_* offsets.
 */
static bool vmx_set_guest_segment(const struct segment *seg,
                                  unsigned long selector_field)
{
        bool ok = true;

        ok &= vmcs_write16(selector_field, seg->selector);
        ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
        ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
        ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
                           seg->access_rights);
        return ok;
}
468
/*
 * Populate the freshly loaded VMCS: host state (return into the
 * hypervisor), guest state (resume Linux where arch_entry left it) and
 * the VMX execution/exit/entry controls. Returns false if any vmwrite
 * failed.
 */
static bool vmcs_setup(struct per_cpu *cpu_data)
{
        struct desc_table_reg dtr;
        unsigned long val;
        bool ok = true;

        /* --- host state: where VM exits land --- */
        ok &= vmcs_write64(HOST_CR0, read_cr0());
        ok &= vmcs_write64(HOST_CR3, read_cr3());
        ok &= vmcs_write64(HOST_CR4, read_cr4());

        ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
        ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
        ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
        ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);

        ok &= vmcs_write64(HOST_FS_BASE, 0);
        ok &= vmcs_write64(HOST_GS_BASE, read_msr(MSR_GS_BASE));
        ok &= vmcs_write64(HOST_TR_BASE, 0);

        read_gdtr(&dtr);
        ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
        read_idtr(&dtr);
        ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);

        ok &= vmcs_write64(HOST_IA32_PAT, read_msr(MSR_IA32_PAT));
        ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);

        ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
        ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
        ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);

        /* VM exits enter vmx_vmexit on an empty per-CPU stack */
        ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
                           sizeof(cpu_data->stack));
        ok &= vmcs_write64(HOST_RIP, (unsigned long)vmx_vmexit);

        /* --- guest state: resume Linux --- */
        ok &= vmx_set_guest_cr(CR0_IDX, cpu_data->linux_cr0);
        ok &= vmx_set_guest_cr(CR4_IDX, cpu_data->linux_cr4);

        ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);

        ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
        ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
        ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
        ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);

        ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
        ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
        ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
        ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);

        ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
        /* skip the entry registers arch_entry pushed onto the Linux stack */
        ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
                           (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
        ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);

        ok &= vmcs_write32(GUEST_SYSENTER_CS,
                           read_msr(MSR_IA32_SYSENTER_CS));
        ok &= vmcs_write64(GUEST_SYSENTER_EIP,
                           read_msr(MSR_IA32_SYSENTER_EIP));
        ok &= vmcs_write64(GUEST_SYSENTER_ESP,
                           read_msr(MSR_IA32_SYSENTER_ESP));

        ok &= vmcs_write64(GUEST_DR7, 0x00000400);
        ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);

        ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
        ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
        ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);

        ok &= vmcs_write64(GUEST_IA32_PAT, cpu_data->pat);
        ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);

        /* no shadow VMCS */
        ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
        ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);

        /* --- execution controls --- */
        /* low 32 bits of the CTLS MSRs provide the required-1 defaults */
        val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
        val |= PIN_BASED_NMI_EXITING;
        ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);

        ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);

        val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
        val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
                CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
        val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
        ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);

        ok &= vmcs_write64(MSR_BITMAP, paging_hvirt2phys(msr_bitmap));

        val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
        val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
                SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST |
                enable_rdtscp;
        ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);

        ok &= vmcs_write64(APIC_ACCESS_ADDR,
                           paging_hvirt2phys(apic_access_page));

        ok &= vmx_set_cell_config();

        /* see vmx_handle_exception_nmi for the interception reason */
        ok &= vmcs_write32(EXCEPTION_BITMAP,
                           (1 << DB_VECTOR) | (1 << AC_VECTOR));

        val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
        val |= VM_EXIT_HOST_ADDR_SPACE_SIZE |
                VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
                VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER;
        ok &= vmcs_write32(VM_EXIT_CONTROLS, val);

        ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
        ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
        ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);

        val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
        val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_PAT |
                VM_ENTRY_LOAD_IA32_EFER;
        ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

        /*
         * NOTE(review): this zeroes the CR4 guest/host mask that
         * vmx_set_guest_cr(CR4_IDX, ...) installed above, handing all CR4
         * bits (incl. VMXE) to the guest — confirm this is intended and not
         * meant to re-establish or keep the restriction mask.
         */
        ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);

        ok &= vmcs_write32(CR3_TARGET_COUNT, 0);

        return ok;
}
601
/*
 * Per-CPU VMX bring-up: validate features, prepare the VMXON region and
 * VMCS, enable VMXON in IA32_FEATURE_CONTROL if possible, enter VMX root
 * operation and program the VMCS.
 *
 * Returns 0 on success; -EBUSY if VMX is already in use, -ENODEV if the
 * firmware locked VMXON off, -EIO on any other setup failure.
 */
int vcpu_init(struct per_cpu *cpu_data)
{
        unsigned long feature_ctrl, mask;
        u32 revision_id;
        int err;

        /* make sure all perf counters are off */
        if ((cpuid_eax(0x0a, 0) & 0xff) > 0)
                write_msr(MSR_IA32_PERF_GLOBAL_CTRL, 0);

        /* VMXE already set means someone else is using VMX */
        if (cpu_data->linux_cr4 & X86_CR4_VMXE)
                return trace_error(-EBUSY);

        err = vmx_check_features();
        if (err)
                return err;

        /* both regions must carry the revision ID from IA32_VMX_BASIC */
        revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
        cpu_data->vmxon_region.revision_id = revision_id;
        cpu_data->vmxon_region.shadow_indicator = 0;
        cpu_data->vmcs.revision_id = revision_id;
        cpu_data->vmcs.shadow_indicator = 0;

        /* Note: We assume that TXT is off */
        feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
        mask = FEATURE_CONTROL_LOCKED |
                FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

        if ((feature_ctrl & mask) != mask) {
                /* locked with VMXON disabled - nothing we can do */
                if (feature_ctrl & FEATURE_CONTROL_LOCKED)
                        return trace_error(-ENODEV);

                feature_ctrl |= mask;
                write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
        }

        /*
         * SDM Volume 3, 2.5: "When loading a control register, reserved bits
         * should always be set to the values previously read."
         * But we want to avoid surprises with new features unknown to us but
         * set by Linux. So check if any assumed reserved bit was set or should
         * be set for VMX operation and bail out if so.
         */
        if ((cpu_data->linux_cr0 | cr_required1[CR0_IDX]) & X86_CR0_RESERVED ||
            (cpu_data->linux_cr4 | cr_required1[CR4_IDX]) & X86_CR4_RESERVED)
                return -EIO;
        /*
         * Bring CR0 and CR4 into well-defined states. If they do not match
         * with VMX requirements, vmxon will fail.
         * X86_CR4_OSXSAVE is enabled if available so that xsetbv can be
         * executed on behalf of a cell.
         */
        write_cr0(X86_CR0_HOST_STATE);
        write_cr4(X86_CR4_HOST_STATE | X86_CR4_VMXE |
                  ((cpuid_ecx(1, 0) & X86_FEATURE_XSAVE) ?
                   X86_CR4_OSXSAVE : 0));

        if (!vmxon(cpu_data))  {
                /* roll CR4 back so Linux continues unharmed */
                write_cr4(cpu_data->linux_cr4);
                return trace_error(-EIO);
        }

        cpu_data->vmx_state = VMXON;

        if (!vmcs_clear(cpu_data) ||
            !vmcs_load(cpu_data) ||
            !vmcs_setup(cpu_data))
                return trace_error(-EIO);

        cpu_data->vmx_state = VMCS_READY;

        return 0;
}
675
/*
 * Tear down per-CPU VMX state: clear the VMCS, leave VMX root operation
 * and drop VMXE from the CR4 value Linux will get back. Idempotent - does
 * nothing if already in VMXOFF state.
 */
void vcpu_exit(struct per_cpu *cpu_data)
{
        if (cpu_data->vmx_state == VMXOFF)
                return;

        cpu_data->vmx_state = VMXOFF;
        /* Write vmx_state to ensure that vcpu_nmi_handler stops accessing
         * the VMCS (a compiler barrier would be sufficient, in fact). */
        memory_barrier();

        vmcs_clear(cpu_data);
        asm volatile("vmxoff" : : : "cc");
        cpu_data->linux_cr4 &= ~X86_CR4_VMXE;
}
690
/*
 * First VM entry: restore the callee-saved registers arch_entry stashed
 * in linux_reg, then vmlaunch into the Linux guest set up by vmcs_setup.
 * Only returns (into the panic path) if vmlaunch itself fails.
 */
void __attribute__((noreturn)) vcpu_activate_vmm(struct per_cpu *cpu_data)
{
        /* We enter Linux at the point arch_entry would return to as well.
         * rax is cleared to signal success to the caller. */
        asm volatile(
                "mov (%%rdi),%%r15\n\t"
                "mov 0x8(%%rdi),%%r14\n\t"
                "mov 0x10(%%rdi),%%r13\n\t"
                "mov 0x18(%%rdi),%%r12\n\t"
                "mov 0x20(%%rdi),%%rbx\n\t"
                "mov 0x28(%%rdi),%%rbp\n\t"
                "vmlaunch\n\t"
                /* only reached on vmlaunch failure */
                "pop %%rbp"
                : /* no output */
                : "a" (0), "D" (cpu_data->linux_reg)
                : "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");

        panic_printk("FATAL: vmlaunch failed, error %d\n",
                     vmcs_read32(VM_INSTRUCTION_ERROR));
        panic_stop();
}
712
/*
 * Hand the CPU back to Linux: copy the guest state Linux needs out of the
 * VMCS into cpu_data, restore host MSRs, then restore the guest register
 * file from guest_regs and "return" to the guest RIP on the guest stack.
 * Never returns to the hypervisor.
 */
void __attribute__((noreturn)) vcpu_deactivate_vmm(void)
{
        unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
        unsigned long linux_ip = vmcs_read64(GUEST_RIP);
        struct per_cpu *cpu_data = this_cpu_data();

        cpu_data->linux_cr0 = vmcs_read64(GUEST_CR0);
        cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);
        cpu_data->linux_cr4 = vmcs_read64(GUEST_CR4);

        cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
        cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
        cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
        cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);

        cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);

        cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);

        cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
        cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
        cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);

        /* SYSENTER MSRs are not auto-restored on vmxoff - do it by hand */
        write_msr(MSR_IA32_SYSENTER_CS, vmcs_read32(GUEST_SYSENTER_CS));
        write_msr(MSR_IA32_SYSENTER_EIP, vmcs_read64(GUEST_SYSENTER_EIP));
        write_msr(MSR_IA32_SYSENTER_ESP, vmcs_read64(GUEST_SYSENTER_ESP));

        cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
        cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
        cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
        cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);

        /* performs vmcs_clear/vmxoff among others, see vcpu_exit */
        arch_cpu_restore(cpu_data, 0);

        /* push the guest RIP as return address onto the guest stack */
        stack--;
        *stack = linux_ip;

        /* restore the guest GPRs from guest_regs (rbx points to them, the
         * layout must match the vmexit save order; rsi slot is skipped by
         * the "add $8" — presumably it holds a value restored elsewhere,
         * TODO confirm layout against vmx_vmexit) and return to the guest */
        asm volatile (
                "mov %%rbx,%%rsp\n\t"
                "pop %%r15\n\t"
                "pop %%r14\n\t"
                "pop %%r13\n\t"
                "pop %%r12\n\t"
                "pop %%r11\n\t"
                "pop %%r10\n\t"
                "pop %%r9\n\t"
                "pop %%r8\n\t"
                "pop %%rdi\n\t"
                "pop %%rsi\n\t"
                "pop %%rbp\n\t"
                "add $8,%%rsp\n\t"
                "pop %%rbx\n\t"
                "pop %%rdx\n\t"
                "pop %%rcx\n\t"
                "mov %%rax,%%rsp\n\t"
                "xor %%rax,%%rax\n\t"
                "ret"
                : : "a" (stack), "b" (&cpu_data->guest_regs));
        __builtin_unreachable();
}
773
/*
 * Bring the vCPU into the architectural INIT/SIPI reset state: real mode,
 * paging off, caches marked disabled via CR0.CD/NW, all segments flat 64K.
 *
 * @sipi_vector: SIPI start vector (guest starts at sipi_vector:0000), or
 *               APIC_BSP_PSEUDO_SIPI to emulate a full BSP hard reset at
 *               the x86 reset vector f000:fff0.
 *
 * Every VMCS write feeds its success into "ok" so that any single failure
 * is detected at the end; failure is fatal for the whole hypervisor.
 */
void vcpu_vendor_reset(unsigned int sipi_vector)
{
	unsigned long val;
	bool ok = true;

	/* CR0 reset value: caching disabled (CD/NW), ET always set */
	ok &= vmx_set_guest_cr(CR0_IDX, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
	ok &= vmx_set_guest_cr(CR4_IDX, 0);

	ok &= vmcs_write64(GUEST_CR3, 0);

	/* RFLAGS bit 1 is architecturally fixed to 1 */
	ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
	ok &= vmcs_write64(GUEST_RSP, 0);

	val = 0;
	if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
		/* hard reset: start at f000:fff0, the x86 reset vector */
		val = 0xfff0;
		sipi_vector = 0xf0;

		/* only cleared on hard reset */
		ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
	}
	ok &= vmcs_write64(GUEST_RIP, val);

	/* real-mode CS after SIPI: selector = vector << 8, base = vector << 12 */
	ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
	ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
	ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
	/* access rights 0x9b: present, accessed, executable/readable code */
	ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);

	/* all data segments: flat 0-base, 64K limit, read/write (AR 0x93) */
	ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_DS_BASE, 0);
	ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_ES_BASE, 0);
	ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_FS_BASE, 0);
	ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_GS_BASE, 0);
	ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);

	ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_SS_BASE, 0);
	ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);

	/* TR access rights 0x8b: present, busy TSS (required by VMX entry) */
	ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_TR_BASE, 0);
	ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);

	/* LDTR access rights 0x82: present, LDT descriptor type */
	ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
	ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
	ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);

	ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
	ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
	ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);

	ok &= vmcs_write64(GUEST_IA32_EFER, 0);

	ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
	ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
	ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);

	/* DR7 architectural reset value */
	ok &= vmcs_write64(GUEST_DR7, 0x00000400);

	/* no pending events or blocking state after reset */
	ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
	ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
	ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
	ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);

	/* the guest restarts in real mode - clear IA-32e entry mode */
	val = vmcs_read32(VM_ENTRY_CONTROLS);
	val &= ~VM_ENTRY_IA32E_MODE;
	ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);

	ok &= vmx_set_cell_config();

	if (!ok) {
		panic_printk("FATAL: CPU reset failed\n");
		panic_stop();
	}
}
866
867 static void vmx_preemption_timer_set_enable(bool enable)
868 {
869         u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
870
871         if (enable)
872                 pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
873         else
874                 pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
875         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
876 }
877
878 void vcpu_nmi_handler(void)
879 {
880         if (this_cpu_data()->vmx_state == VMCS_READY)
881                 vmx_preemption_timer_set_enable(true);
882 }
883
/*
 * Park this vCPU: reset it to the SIPI state with vector 0, then place it
 * into the HLT activity state so it stays idle until woken again.
 */
void vcpu_park(void)
{
	vcpu_vendor_reset(0);
	vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
}
889
/* Advance the guest RIP by inst_len to step over an emulated instruction. */
void vcpu_skip_emulated_instruction(unsigned int inst_len)
{
	vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
}
894
/*
 * Process pending management events: disarm the preemption timer (armed by
 * vcpu_nmi_handler) first, then run the generic event check.
 */
static void vmx_check_events(void)
{
	vmx_preemption_timer_set_enable(false);
	x86_check_events();
}
900
901 static void vmx_handle_exception_nmi(void)
902 {
903         u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
904
905         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) {
906                 this_cpu_data()->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
907                 asm volatile("int %0" : : "i" (NMI_VECTOR));
908         } else {
909                 this_cpu_data()->stats[JAILHOUSE_CPU_STAT_VMEXITS_EXCEPTION]++;
910                 /*
911                  * Reinject the event straight away. We only intercept #DB and
912                  * #AC to prevent that malicious guests can trigger infinite
913                  * loops in microcode (see e.g. CVE-2015-5307 and
914                  * CVE-2015-8104).
915                  */
916                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
917                              intr_info & INTR_TO_VECTORING_INFO_MASK);
918                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
919                              vmcs_read32(VM_EXIT_INTR_ERROR_CODE));
920         }
921
922         /*
923          * Check for events even in the exception case in order to maintain
924          * control over the guest if it triggered #DB or #AC loops.
925          */
926         vmx_check_events();
927 }
928
929 static void update_efer(void)
930 {
931         unsigned long efer = vmcs_read64(GUEST_IA32_EFER);
932
933         if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
934                 return;
935
936         efer |= EFER_LMA;
937         vmcs_write64(GUEST_IA32_EFER, efer);
938         vmcs_write32(VM_ENTRY_CONTROLS,
939                      vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
940 }
941
942 static bool vmx_handle_cr(void)
943 {
944         u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
945         unsigned long cr, reg, val;
946
947         cr = exit_qualification & 0xf;
948         reg = (exit_qualification >> 8) & 0xf;
949
950         switch ((exit_qualification >> 4) & 3) {
951         case 0: /* move to cr */
952                 if (reg == 4)
953                         val = vmcs_read64(GUEST_RSP);
954                 else
955                         val = this_cpu_data()->guest_regs.by_index[15 - reg];
956
957                 if (cr == 0 || cr == 4) {
958                         vcpu_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
959                         /* TODO: check for #GP reasons */
960                         vmx_set_guest_cr(cr ? CR4_IDX : CR0_IDX, val);
961                         if (cr == 0 && val & X86_CR0_PG)
962                                 update_efer();
963                         return true;
964                 }
965                 break;
966         default:
967                 break;
968         }
969         panic_printk("FATAL: Unhandled CR access, qualification %x\n",
970                      exit_qualification);
971         return false;
972 }
973
974 bool vcpu_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
975 {
976         if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
977                 pg_structs->root_paging = x86_64_paging;
978                 pg_structs->root_table_gphys =
979                         vmcs_read64(GUEST_CR3) & BIT_MASK(51, 12);
980         } else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
981                  !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
982                 pg_structs->root_paging = i386_paging;
983                 pg_structs->root_table_gphys =
984                         vmcs_read64(GUEST_CR3) & BIT_MASK(31, 12);
985         } else {
986                 printk("FATAL: Unsupported paging mode\n");
987                 return false;
988         }
989         return true;
990 }
991
/* Program the guest's IA32_PAT value in the VMCS. */
void vcpu_vendor_set_guest_pat(unsigned long val)
{
	vmcs_write64(GUEST_IA32_PAT, val);
}
996
997 static bool vmx_handle_apic_access(void)
998 {
999         struct guest_paging_structures pg_structs;
1000         unsigned int inst_len, offset;
1001         u64 qualification;
1002         bool is_write;
1003
1004         qualification = vmcs_read64(EXIT_QUALIFICATION);
1005
1006         switch (qualification & APIC_ACCESS_TYPE_MASK) {
1007         case APIC_ACCESS_TYPE_LINEAR_READ:
1008         case APIC_ACCESS_TYPE_LINEAR_WRITE:
1009                 is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
1010                 offset = qualification & APIC_ACCESS_OFFSET_MASK;
1011                 if (offset & 0x00f)
1012                         break;
1013
1014                 if (!vcpu_get_guest_paging_structs(&pg_structs))
1015                         break;
1016
1017                 inst_len = apic_mmio_access(vmcs_read64(GUEST_RIP),
1018                                             &pg_structs, offset >> 4,
1019                                             is_write);
1020                 if (!inst_len)
1021                         break;
1022
1023                 vcpu_skip_emulated_instruction(inst_len);
1024                 return true;
1025         }
1026         panic_printk("FATAL: Unhandled APIC access, "
1027                      "qualification %x\n", qualification);
1028         return false;
1029 }
1030
/*
 * Dump additional VM-exit diagnostics for the panic path: exit
 * qualification, vectoring/interrupt info and, for EPT-related exits, the
 * faulting guest addresses.
 */
static void dump_vm_exit_details(u32 reason)
{
	panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
	panic_printk("vectoring info: %x interrupt info: %x\n",
		     vmcs_read32(IDT_VECTORING_INFO_FIELD),
		     vmcs_read32(VM_EXIT_INTR_INFO));
	if (reason == EXIT_REASON_EPT_VIOLATION ||
	    reason == EXIT_REASON_EPT_MISCONFIG)
		panic_printk("guest phys addr %p guest linear addr: %p\n",
			     vmcs_read64(GUEST_PHYSICAL_ADDRESS),
			     vmcs_read64(GUEST_LINEAR_ADDRESS));
}
1043
/*
 * Dump the guest register and control state for the panic path: RIP/RSP,
 * general-purpose registers, CS descriptor, control registers and EFER.
 */
static void dump_guest_regs(union registers *guest_regs)
{
	panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
		     vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
	panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
		     guest_regs->rbx, guest_regs->rcx);
	panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
		     guest_regs->rsi, guest_regs->rdi);
	panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
		     vmcs_read64(GUEST_CS_SELECTOR),
		     vmcs_read64(GUEST_CS_BASE),
		     vmcs_read32(GUEST_CS_AR_BYTES),
		     !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
	panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
		     vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
	panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
}
1061
1062 void vcpu_vendor_get_io_intercept(struct vcpu_io_intercept *io)
1063 {
1064         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1065
1066         /* parse exit qualification for I/O instructions (see SDM, 27.2.1 ) */
1067         io->port = (exitq >> 16) & 0xFFFF;
1068         io->size = (exitq & 0x3) + 1;
1069         io->in = !!((exitq & 0x8) >> 3);
1070         io->inst_len = vmcs_read64(VM_EXIT_INSTRUCTION_LEN);
1071         io->rep_or_str = !!(exitq & 0x30);
1072 }
1073
1074 void vcpu_vendor_get_mmio_intercept(struct vcpu_mmio_intercept *mmio)
1075 {
1076         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1077
1078         mmio->phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1079         /* We don't enable dirty/accessed bit updated in EPTP,
1080          * so only read of write flags can be set, not both. */
1081         mmio->is_write = !!(exitq & 0x2);
1082 }
1083
/*
 * Central VM-exit dispatcher: reads the exit reason, updates the per-CPU
 * statistics and forwards the exit to the matching handler. Handlers that
 * return (or report success) resume the guest; any unhandled exit falls
 * through to a register dump and parks the CPU.
 */
void vcpu_handle_exit(struct per_cpu *cpu_data)
{
	u32 reason = vmcs_read32(VM_EXIT_REASON);

	cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;

	switch (reason) {
	case EXIT_REASON_EXCEPTION_NMI:
		vmx_handle_exception_nmi();
		return;
	case EXIT_REASON_PREEMPTION_TIMER:
		/* armed by vcpu_nmi_handler to process management events */
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
		vmx_check_events();
		return;
	case EXIT_REASON_CPUID:
		vcpu_handle_cpuid();
		return;
	case EXIT_REASON_VMCALL:
		vcpu_handle_hypercall();
		return;
	case EXIT_REASON_CR_ACCESS:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
		if (vmx_handle_cr())
			return;
		break;
	case EXIT_REASON_MSR_READ:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
		if (vcpu_handle_msr_read())
			return;
		break;
	case EXIT_REASON_MSR_WRITE:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
		if (cpu_data->guest_regs.rcx == MSR_IA32_PERF_GLOBAL_CTRL) {
			/* ignore writes */
			vcpu_skip_emulated_instruction(X86_INST_LEN_WRMSR);
			return;
		} else if (vcpu_handle_msr_write())
			return;
		break;
	case EXIT_REASON_APIC_ACCESS:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
		if (vmx_handle_apic_access())
			return;
		break;
	case EXIT_REASON_XSETBV:
		if (vcpu_handle_xsetbv())
			return;
		break;
	case EXIT_REASON_IO_INSTRUCTION:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
		if (vcpu_handle_io_access())
			return;
		break;
	case EXIT_REASON_EPT_VIOLATION:
		cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
		if (vcpu_handle_mmio_access())
			return;
		break;
	default:
		panic_printk("FATAL: %s, reason %d\n",
			     (reason & EXIT_REASONS_FAILED_VMENTRY) ?
			     "VM-Entry failure" : "Unhandled VM-Exit",
			     (u16)reason);
		dump_vm_exit_details(reason);
		break;
	}
	/* unhandled exit: dump state and take this CPU out of service */
	dump_guest_regs(&cpu_data->guest_regs);
	panic_park();
}
1153
/*
 * Called when vmresume itself fails (VMfail): report the VM-instruction
 * error from the VMCS and halt the hypervisor.
 */
void vmx_entry_failure(void)
{
	panic_printk("FATAL: vmresume failed, error %d\n",
		     vmcs_read32(VM_INSTRUCTION_ERROR));
	panic_stop();
}
1160
/*
 * Expose the cell's VMX I/O port bitmap (2 pages, covering the 64K port
 * space) to the generic vcpu code.
 */
void vcpu_vendor_get_cell_io_bitmap(struct cell *cell,
				    struct vcpu_io_bitmap *iobm)
{
	iobm->data = cell->arch.vmx.io_bitmap;
	iobm->size = PIO_BITMAP_PAGES * PAGE_SIZE;
}
1167
1168 void vcpu_vendor_get_execution_state(struct vcpu_execution_state *x_state)
1169 {
1170         x_state->efer = vmcs_read64(GUEST_IA32_EFER);
1171         x_state->rflags = vmcs_read64(GUEST_RFLAGS);
1172         x_state->cs = vmcs_read16(GUEST_CS_SELECTOR);
1173         x_state->rip = vmcs_read64(GUEST_RIP);
1174 }
1175
/* Enable interrupts on this CPU (sti). */
void enable_irq(void)
{
	asm volatile("sti" : : : "memory");
}
1180
/* Disable interrupts on this CPU (cli). */
void disable_irq(void)
{
	asm volatile("cli" : : : "memory");
}