rtime.felk.cvut.cz Git - jailhouse.git/blob - hypervisor/arch/x86/vmx.c
x86: vmx: Remove some no longer needed includes
1 /*
2  * Jailhouse, a Linux-based partitioning hypervisor
3  *
4  * Copyright (c) Siemens AG, 2013-2015
5  * Copyright (c) Valentine Sinitsyn, 2014
6  *
7  * Authors:
8  *  Jan Kiszka <jan.kiszka@siemens.com>
9  *  Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
10  *
11  * This work is licensed under the terms of the GNU GPL, version 2.  See
12  * the COPYING file in the top-level directory.
13  */
14
15 #include <jailhouse/entry.h>
16 #include <jailhouse/paging.h>
17 #include <jailhouse/processor.h>
18 #include <jailhouse/printk.h>
19 #include <jailhouse/string.h>
20 #include <jailhouse/control.h>
21 #include <jailhouse/hypercall.h>
22 #include <asm/apic.h>
23 #include <asm/control.h>
24 #include <asm/iommu.h>
25 #include <asm/vcpu.h>
26 #include <asm/vmx.h>
27
28 #define CR0_IDX         0
29 #define CR4_IDX         1
30
31 static const struct segment invalid_seg = {
32         .access_rights = 0x10000
33 };
34
35 /* bit cleared: direct access allowed */
36 // TODO: convert to whitelist
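/*
 * Note: four 1 KiB sub-bitmaps as defined by the Intel SDM for the MSR
 * bitmap page: low/high MSR ranges for reads and writes, one bit per MSR.
 * The MSRs named in the comments below are those whose bits are set, i.e.
 * accesses to them cause a VM exit.
 */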
37 static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
38         [ VMX_MSR_BMP_0000_READ ] = {
39                 [      0/8 ...  0x26f/8 ] = 0,
40                 [  0x270/8 ...  0x277/8 ] = 0x80, /* 0x277 */
41                 [  0x278/8 ...  0x2f7/8 ] = 0,
42                 [  0x2f8/8 ...  0x2ff/8 ] = 0x80, /* 0x2ff */
43                 [  0x300/8 ...  0x7ff/8 ] = 0,
44                 [  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
45                 [  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
46                 [  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
47                 [  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
48                 [  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
49                 [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
50                 [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
51                 [  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
52                 [  0x840/8 ... 0x1fff/8 ] = 0,
53         },
54         [ VMX_MSR_BMP_C000_READ ] = {
55                 [      0/8 ... 0x1fff/8 ] = 0,
56         },
57         [ VMX_MSR_BMP_0000_WRITE ] = {
58                 [      0/8 ...   0x17/8 ] = 0,
59                 [   0x18/8 ...   0x1f/8 ] = 0x08, /* 0x01b */
60                 [   0x20/8 ...  0x1ff/8 ] = 0,
61                 [  0x200/8 ...  0x277/8 ] = 0xff, /* 0x200 - 0x277 */
62                 [  0x278/8 ...  0x2f7/8 ] = 0,
63                 [  0x2f8/8 ...  0x2ff/8 ] = 0x80, /* 0x2ff */
64                 [  0x300/8 ...  0x387/8 ] = 0,
65                 [  0x388/8 ...  0x38f/8 ] = 0x80, /* 0x38f */
66                 [  0x390/8 ...  0x7ff/8 ] = 0,
67                 [  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
68                 [  0x810/8 ...  0x827/8 ] = 0,
69                 [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
70                 [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
71                 [  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
72                 [  0x840/8 ... 0x1fff/8 ] = 0,
73         },
74         [ VMX_MSR_BMP_C000_WRITE ] = {
75                 [      0/8 ... 0x1fff/8 ] = 0,
76         },
77 };
78 static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
79 static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];
80 static u32 enable_rdtscp;
81 static unsigned long cr_maybe1[2], cr_required1[2];
82
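/*
 * VMX instructions report errors through RFLAGS: CF signals VMfailInvalid,
 * ZF signals VMfailValid (error code then in VM_INSTRUCTION_ERROR). The
 * "seta" in the helpers below thus yields 1 only if both flags are clear,
 * i.e. on success.
 */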
83 static bool vmxon(struct per_cpu *cpu_data)
84 {
85         unsigned long vmxon_addr;
86         u8 ok;
87
88         vmxon_addr = paging_hvirt2phys(&cpu_data->vmxon_region);
89         asm volatile(
90                 "vmxon (%1)\n\t"
91                 "seta %0"
92                 : "=rm" (ok)
93                 : "r" (&vmxon_addr), "m" (vmxon_addr)
94                 : "memory", "cc");
95         return ok;
96 }
97
98 static bool vmcs_clear(struct per_cpu *cpu_data)
99 {
100         unsigned long vmcs_addr = paging_hvirt2phys(&cpu_data->vmcs);
101         u8 ok;
102
103         asm volatile(
104                 "vmclear (%1)\n\t"
105                 "seta %0"
106                 : "=qm" (ok)
107                 : "r" (&vmcs_addr), "m" (vmcs_addr)
108                 : "memory", "cc");
109         return ok;
110 }
111
112 static bool vmcs_load(struct per_cpu *cpu_data)
113 {
114         unsigned long vmcs_addr = paging_hvirt2phys(&cpu_data->vmcs);
115         u8 ok;
116
117         asm volatile(
118                 "vmptrld (%1)\n\t"
119                 "seta %0"
120                 : "=qm" (ok)
121                 : "r" (&vmcs_addr), "m" (vmcs_addr)
122                 : "memory", "cc");
123         return ok;
124 }
125
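/*
 * VMREAD/VMWRITE transfer a natural-width register; for narrower VMCS
 * fields the hardware zero-extends or ignores the upper bits, so the
 * 16/32-bit wrappers below can simply forward to the 64-bit accessors.
 */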
126 static inline unsigned long vmcs_read64(unsigned long field)
127 {
128         unsigned long value;
129
130         asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
131         return value;
132 }
133
134 static inline u16 vmcs_read16(unsigned long field)
135 {
136         return vmcs_read64(field);
137 }
138
139 static inline u32 vmcs_read32(unsigned long field)
140 {
141         return vmcs_read64(field);
142 }
143
144 static bool vmcs_write64(unsigned long field, unsigned long val)
145 {
146         u8 ok;
147
148         asm volatile(
149                 "vmwrite %1,%2\n\t"
150                 "setnz %0"
151                 : "=qm" (ok)
152                 : "r" (val), "r" (field)
153                 : "cc");
154         if (!ok)
155                 printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
156                        field, vmcs_read32(VM_INSTRUCTION_ERROR),
157                        __builtin_return_address(0));
158         return ok;
159 }
160
161 static bool vmcs_write16(unsigned long field, u16 value)
162 {
163         return vmcs_write64(field, value);
164 }
165
166 static bool vmcs_write32(unsigned long field, u32 value)
167 {
168         return vmcs_write64(field, value);
169 }
170
171 static bool vmx_define_cr_restrictions(unsigned int cr_idx,
172                                        unsigned long maybe1,
173                                        unsigned long required1)
174 {
175         if (!cr_maybe1[cr_idx]) {
176                 cr_maybe1[cr_idx] = maybe1;
177                 cr_required1[cr_idx] = required1;
178                 return true;
179         }
180
181         return cr_maybe1[cr_idx] == maybe1 &&
182                 cr_required1[cr_idx] == required1;
183 }
184
185 static int vmx_check_features(void)
186 {
187         unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
188         unsigned long vmx_pin_ctrl, vmx_basic, maybe1, required1;
189         unsigned long vmx_entry_ctrl, vmx_exit_ctrl;
190
191         if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
192                 return trace_error(-ENODEV);
193
194         vmx_basic = read_msr(MSR_IA32_VMX_BASIC);
195
196         /* require VMCS size <= PAGE_SIZE,
197          * VMCS memory access type == write back and
198          * availability of TRUE_*_CTLS */
199         if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
200             ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
201             !(vmx_basic & (1UL << 55)))
202                 return trace_error(-EIO);
203
204         /* require NMI exiting and preemption timer support */
205         vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
206         if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
207             !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
208                 return trace_error(-EIO);
209
210         /* require I/O and MSR bitmap as well as secondary controls support */
211         vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
212         if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
213             !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
214             !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
215                 return trace_error(-EIO);
216
217         /* require disabling of CR3 access interception */
218         vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
219         if (vmx_proc_ctrl &
220             (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
221                 return trace_error(-EIO);
222
223         /* require APIC access, EPT and unrestricted guest mode support */
224         vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
225         ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
226         if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
227             !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
228             (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
229             !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
230             !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
231                 return trace_error(-EIO);
232
233         /* require RDTSCP if present in CPUID */
234         if (cpuid_edx(0x80000001) & X86_FEATURE_RDTSCP) {
235                 enable_rdtscp = SECONDARY_EXEC_RDTSCP;
236                 if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_RDTSCP))
237                         return trace_error(-EIO);
238         }
239
240         /* require PAT and EFER save/restore */
241         vmx_entry_ctrl = read_msr(MSR_IA32_VMX_ENTRY_CTLS) >> 32;
242         vmx_exit_ctrl = read_msr(MSR_IA32_VMX_EXIT_CTLS) >> 32;
243         if (!(vmx_entry_ctrl & VM_ENTRY_LOAD_IA32_PAT) ||
244             !(vmx_entry_ctrl & VM_ENTRY_LOAD_IA32_EFER) ||
245             !(vmx_exit_ctrl & VM_EXIT_SAVE_IA32_PAT) ||
246             !(vmx_exit_ctrl & VM_EXIT_LOAD_IA32_PAT) ||
247             !(vmx_exit_ctrl & VM_EXIT_SAVE_IA32_EFER) ||
248             !(vmx_exit_ctrl & VM_EXIT_LOAD_IA32_EFER))
249                 return trace_error(-EIO);
250
251         /* require activity state HLT */
252         if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
253                 return trace_error(-EIO);
254
255         /*
256          * Retrieve/validate restrictions on CR0
257          *
258          * In addition to what the VMX MSRs tell us, make sure that
259          * - NW and CD are kept off as they are not updated on VM exit and we
260          *   don't want them enabled for performance reasons while in root mode
261          * - PE and PG can be freely chosen (by the guest) because we demand
262          *   unrestricted guest mode support anyway
263          * - ET is always on (architectural requirement)
264          */
265         maybe1 = read_msr(MSR_IA32_VMX_CR0_FIXED1) &
266                 ~(X86_CR0_NW | X86_CR0_CD);
267         required1 = (read_msr(MSR_IA32_VMX_CR0_FIXED0) &
268                 ~(X86_CR0_PE | X86_CR0_PG)) | X86_CR0_ET;
269         if (!vmx_define_cr_restrictions(CR0_IDX, maybe1, required1))
270                 return trace_error(-EIO);
271
272         /* Retrieve/validate restrictions on CR4 */
273         maybe1 = read_msr(MSR_IA32_VMX_CR4_FIXED1);
274         required1 = read_msr(MSR_IA32_VMX_CR4_FIXED0);
275         if (!vmx_define_cr_restrictions(CR4_IDX, maybe1, required1))
276                 return trace_error(-EIO);
277
278         return 0;
279 }
280
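/*
 * Non-leaf EPT entries get read/write/execute set unconditionally;
 * permissions are only restricted in the leaf entries of actual mappings.
 */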
281 static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
282 {
283         *pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
284                 EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
285 }
286
287 int vcpu_vendor_init(void)
288 {
289         unsigned int n;
290         int err;
291
292         err = vmx_check_features();
293         if (err)
294                 return err;
295
296         /* derive ept_paging from very similar x86_64_paging */
297         memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
298         for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
299                 ept_paging[n].set_next_pt = ept_set_next_pt;
300         if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
301                 ept_paging[1].page_size = 0;
302         if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
303                 ept_paging[2].page_size = 0;
304
305         if (using_x2apic) {
306                 /* allow direct x2APIC access except for ICR writes */
307                 memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
308                        0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
309                 memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
310                        0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
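                /* MSR_X2APIC_ICR (0x830) is bit 0 of its bitmap byte */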
311                 msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;
312         }
313
314         return vcpu_cell_init(&root_cell);
315 }
316
317 unsigned long arch_paging_gphys2phys(struct per_cpu *cpu_data,
318                                      unsigned long gphys, unsigned long flags)
319 {
320         return paging_virt2phys(&cpu_data->cell->vmx.ept_structs, gphys,
321                                 flags);
322 }
323
324 int vcpu_vendor_cell_init(struct cell *cell)
325 {
326         int err = -ENOMEM;
327
328         /* allocate io_bitmap */
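        /* (two pages: bitmap A, ports 0x0000-0x7fff, and B, 0x8000-0xffff) */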
329         cell->vmx.io_bitmap = page_alloc(&mem_pool, 2);
330         if (!cell->vmx.io_bitmap)
331                 return err;
332
333         /* build root EPT of cell */
334         cell->vmx.ept_structs.root_paging = ept_paging;
335         cell->vmx.ept_structs.root_table = (page_table_t)cell->root_table_page;
336
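        /*
         * Map the (empty) APIC access page at the xAPIC MMIO address so that
         * guest xAPIC accesses are turned into APIC-access VM exits, handled
         * by vmx_handle_apic_access().
         */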
337         err = paging_create(&cell->vmx.ept_structs,
338                             paging_hvirt2phys(apic_access_page),
339                             PAGE_SIZE, XAPIC_BASE,
340                             EPT_FLAG_READ | EPT_FLAG_WRITE | EPT_FLAG_WB_TYPE,
341                             PAGING_NON_COHERENT);
342         if (err)
343                 goto err_free_io_bitmap;
344
345         return 0;
346
347 err_free_io_bitmap:
348         page_free(&mem_pool, cell->vmx.io_bitmap, 2);
349
350         return err;
351 }
352
353 int vcpu_map_memory_region(struct cell *cell,
354                            const struct jailhouse_memory *mem)
355 {
356         u64 phys_start = mem->phys_start;
357         u32 flags = EPT_FLAG_WB_TYPE;
358
359         if (mem->flags & JAILHOUSE_MEM_READ)
360                 flags |= EPT_FLAG_READ;
361         if (mem->flags & JAILHOUSE_MEM_WRITE)
362                 flags |= EPT_FLAG_WRITE;
363         if (mem->flags & JAILHOUSE_MEM_EXECUTE)
364                 flags |= EPT_FLAG_EXECUTE;
365         if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
366                 phys_start = paging_hvirt2phys(&cell->comm_page);
367
368         return paging_create(&cell->vmx.ept_structs, phys_start, mem->size,
369                              mem->virt_start, flags, PAGING_NON_COHERENT);
370 }
371
372 int vcpu_unmap_memory_region(struct cell *cell,
373                              const struct jailhouse_memory *mem)
374 {
375         return paging_destroy(&cell->vmx.ept_structs, mem->virt_start,
376                               mem->size, PAGING_NON_COHERENT);
377 }
378
379 void vcpu_vendor_cell_exit(struct cell *cell)
380 {
381         paging_destroy(&cell->vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
382                        PAGING_NON_COHERENT);
383         page_free(&mem_pool, cell->vmx.io_bitmap, 2);
384 }
385
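/*
 * Flush guest-physical mappings derived from the cell's EPT tables, using a
 * single-context INVEPT when supported and a global one otherwise.
 */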
386 void vcpu_tlb_flush(void)
387 {
388         unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
389         struct {
390                 u64 eptp;
391                 u64 reserved;
392         } descriptor;
393         u64 type;
394         u8 ok;
395
396         descriptor.reserved = 0;
397         if (ept_cap & EPT_INVEPT_SINGLE) {
398                 type = VMX_INVEPT_SINGLE;
399                 descriptor.eptp = vmcs_read64(EPT_POINTER);
400         } else {
401                 type = VMX_INVEPT_GLOBAL;
402                 descriptor.eptp = 0;
403         }
404         asm volatile(
405                 "invept (%1),%2\n\t"
406                 "seta %0\n\t"
407                 : "=qm" (ok)
408                 : "r" (&descriptor), "r" (type)
409                 : "memory", "cc");
410
411         if (!ok) {
412                 panic_printk("FATAL: invept failed, error %d\n",
413                              vmcs_read32(VM_INSTRUCTION_ERROR));
414                 panic_stop();
415         }
416 }
417
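/*
 * Guest CR0/CR4 handling: the real guest register gets the bits required by
 * VMX operation forced on, the read shadow reflects the value the guest
 * expects to see, and the guest/host mask traps writes to all bits the guest
 * may not change freely.
 */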
418 static bool vmx_set_guest_cr(unsigned int cr_idx, unsigned long val)
419 {
420         bool ok = true;
421
422         if (cr_idx)
423                 val |= X86_CR4_VMXE; /* keeps the hypervisor visible */
424
425         ok &= vmcs_write64(cr_idx ? GUEST_CR4 : GUEST_CR0,
426                            (val & cr_maybe1[cr_idx]) | cr_required1[cr_idx]);
427         ok &= vmcs_write64(cr_idx ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
428         ok &= vmcs_write64(cr_idx ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
429                            cr_required1[cr_idx] | ~cr_maybe1[cr_idx]);
430
431         return ok;
432 }
433
434 static bool vmx_set_cell_config(void)
435 {
436         struct cell *cell = this_cell();
437         u8 *io_bitmap;
438         bool ok = true;
439
440         io_bitmap = cell->vmx.io_bitmap;
441         ok &= vmcs_write64(IO_BITMAP_A, paging_hvirt2phys(io_bitmap));
442         ok &= vmcs_write64(IO_BITMAP_B,
443                            paging_hvirt2phys(io_bitmap + PAGE_SIZE));
444
445         ok &= vmcs_write64(EPT_POINTER,
446                         paging_hvirt2phys(cell->vmx.ept_structs.root_table) |
447                         EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);
448
449         return ok;
450 }
451
452 static bool vmx_set_guest_segment(const struct segment *seg,
453                                   unsigned long selector_field)
454 {
455         bool ok = true;
456
457         ok &= vmcs_write16(selector_field, seg->selector);
458         ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
459         ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
460         ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
461                            seg->access_rights);
462         return ok;
463 }
464
465 static bool vmcs_setup(struct per_cpu *cpu_data)
466 {
467         struct desc_table_reg dtr;
468         unsigned long val;
469         bool ok = true;
470
471         ok &= vmcs_write64(HOST_CR0, read_cr0());
472         ok &= vmcs_write64(HOST_CR3, read_cr3());
473         ok &= vmcs_write64(HOST_CR4, read_cr4());
474
475         ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
476         ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
477         ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
478         ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
479         ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
480         ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
481         ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);
482
483         ok &= vmcs_write64(HOST_FS_BASE, 0);
484         ok &= vmcs_write64(HOST_GS_BASE, read_msr(MSR_GS_BASE));
485         ok &= vmcs_write64(HOST_TR_BASE, 0);
486
487         read_gdtr(&dtr);
488         ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
489         read_idtr(&dtr);
490         ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);
491
492         ok &= vmcs_write64(HOST_IA32_PAT, read_msr(MSR_IA32_PAT));
493         ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);
494
495         ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
496         ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
497         ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);
498
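        /* VM exits enter vmx_vmexit with RSP at the top of the per-CPU stack */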
499         ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
500                            sizeof(cpu_data->stack));
501         ok &= vmcs_write64(HOST_RIP, (unsigned long)vmx_vmexit);
502
503         ok &= vmx_set_guest_cr(CR0_IDX, cpu_data->linux_cr0);
504         ok &= vmx_set_guest_cr(CR4_IDX, cpu_data->linux_cr4);
505
506         ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);
507
508         ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
509         ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
510         ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
511         ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
512         ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
513         ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
514         ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
515         ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);
516
517         ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
518         ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
519         ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
520         ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);
521
522         ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
523         ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
524                            (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
525         ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);
526
527         ok &= vmcs_write32(GUEST_SYSENTER_CS,
528                            read_msr(MSR_IA32_SYSENTER_CS));
529         ok &= vmcs_write64(GUEST_SYSENTER_EIP,
530                            read_msr(MSR_IA32_SYSENTER_EIP));
531         ok &= vmcs_write64(GUEST_SYSENTER_ESP,
532                            read_msr(MSR_IA32_SYSENTER_ESP));
533
534         ok &= vmcs_write64(GUEST_DR7, 0x00000400);
535         ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
536
537         ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
538         ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
539         ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
540
541         ok &= vmcs_write64(GUEST_IA32_PAT, cpu_data->pat);
542         ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);
543
544         ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
545         ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
546
547         val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
548         val |= PIN_BASED_NMI_EXITING;
549         ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
550
551         ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
552
553         val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
554         val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
555                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
556         val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
557         ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);
558
559         ok &= vmcs_write64(MSR_BITMAP, paging_hvirt2phys(msr_bitmap));
560
561         val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
562         val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
563                 SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST |
564                 enable_rdtscp;
565         ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);
566
567         ok &= vmcs_write64(APIC_ACCESS_ADDR,
568                            paging_hvirt2phys(apic_access_page));
569
570         ok &= vmx_set_cell_config();
571
572         ok &= vmcs_write32(EXCEPTION_BITMAP, 0);
573
574         val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
575         val |= VM_EXIT_HOST_ADDR_SPACE_SIZE |
576                 VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
577                 VM_EXIT_SAVE_IA32_EFER | VM_EXIT_LOAD_IA32_EFER;
578         ok &= vmcs_write32(VM_EXIT_CONTROLS, val);
579
580         ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
581         ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
582         ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
583
584         val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
585         val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_PAT |
586                 VM_ENTRY_LOAD_IA32_EFER;
587         ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
588
589         ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);
590
591         ok &= vmcs_write32(CR3_TARGET_COUNT, 0);
592
593         return ok;
594 }
595
596 int vcpu_init(struct per_cpu *cpu_data)
597 {
598         unsigned long feature_ctrl, mask;
599         u32 revision_id;
600         int err;
601
602         /* make sure all perf counters are off */
603         if ((cpuid_eax(0x0a) & 0xff) > 0)
604                 write_msr(MSR_IA32_PERF_GLOBAL_CTRL, 0);
605
606         if (cpu_data->linux_cr4 & X86_CR4_VMXE)
607                 return trace_error(-EBUSY);
608
609         err = vmx_check_features();
610         if (err)
611                 return err;
612
613         revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
614         cpu_data->vmxon_region.revision_id = revision_id;
615         cpu_data->vmxon_region.shadow_indicator = 0;
616         cpu_data->vmcs.revision_id = revision_id;
617         cpu_data->vmcs.shadow_indicator = 0;
618
619         /* Note: We assume that TXT is off */
620         feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
621         mask = FEATURE_CONTROL_LOCKED |
622                 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
623
624         if ((feature_ctrl & mask) != mask) {
625                 if (feature_ctrl & FEATURE_CONTROL_LOCKED)
626                         return trace_error(-ENODEV);
627
628                 feature_ctrl |= mask;
629                 write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
630         }
631
632         /*
633          * SDM Volume 3, 2.5: "When loading a control register, reserved bits
634          * should always be set to the values previously read."
635          * But we want to avoid surprises with new features unknown to us but
636          * set by Linux. So check if any assumed reserved bit was set or should
637          * be set for VMX operation and bail out if so.
638          */
639         if ((cpu_data->linux_cr0 | cr_required1[CR0_IDX]) & X86_CR0_RESERVED ||
640             (cpu_data->linux_cr4 | cr_required1[CR4_IDX]) & X86_CR4_RESERVED)
641                 return -EIO;
642         /*
643          * Bring CR0 and CR4 into well-defined states. If they do not match
644          * with VMX requirements, vmxon will fail.
645          * X86_CR4_OSXSAVE is enabled if available so that xsetbv can be
646          * executed on behalf of a cell.
647          */
648         write_cr0(X86_CR0_HOST_STATE);
649         write_cr4(X86_CR4_HOST_STATE | X86_CR4_VMXE |
650                   ((cpuid_ecx(1) & X86_FEATURE_XSAVE) ? X86_CR4_OSXSAVE : 0));
651
652         if (!vmxon(cpu_data))  {
653                 write_cr4(cpu_data->linux_cr4);
654                 return trace_error(-EIO);
655         }
656
657         cpu_data->vmx_state = VMXON;
658
659         if (!vmcs_clear(cpu_data) ||
660             !vmcs_load(cpu_data) ||
661             !vmcs_setup(cpu_data))
662                 return trace_error(-EIO);
663
664         cpu_data->vmx_state = VMCS_READY;
665
666         return 0;
667 }
668
669 void vcpu_exit(struct per_cpu *cpu_data)
670 {
671         if (cpu_data->vmx_state == VMXOFF)
672                 return;
673
674         cpu_data->vmx_state = VMXOFF;
675         /* Write vmx_state to ensure that vcpu_nmi_handler stops accessing
676          * the VMCS (a compiler barrier would be sufficient, in fact). */
677         memory_barrier();
678
679         vmcs_clear(cpu_data);
680         asm volatile("vmxoff" : : : "cc");
681         cpu_data->linux_cr4 &= ~X86_CR4_VMXE;
682 }
683
684 void __attribute__((noreturn)) vcpu_activate_vmm(struct per_cpu *cpu_data)
685 {
686         /* We enter Linux at the point arch_entry would return to as well.
687          * rax is cleared to signal success to the caller. */
688         asm volatile(
689                 "mov (%%rdi),%%r15\n\t"
690                 "mov 0x8(%%rdi),%%r14\n\t"
691                 "mov 0x10(%%rdi),%%r13\n\t"
692                 "mov 0x18(%%rdi),%%r12\n\t"
693                 "mov 0x20(%%rdi),%%rbx\n\t"
694                 "mov 0x28(%%rdi),%%rbp\n\t"
695                 "vmlaunch\n\t"
696                 "pop %%rbp"
697                 : /* no output */
698                 : "a" (0), "D" (cpu_data->linux_reg)
699                 : "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");
700
701         panic_printk("FATAL: vmlaunch failed, error %d\n",
702                      vmcs_read32(VM_INSTRUCTION_ERROR));
703         panic_stop();
704 }
705
706 void __attribute__((noreturn)) vcpu_deactivate_vmm(void)
707 {
708         unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
709         unsigned long linux_ip = vmcs_read64(GUEST_RIP);
710         struct per_cpu *cpu_data = this_cpu_data();
711
712         cpu_data->linux_cr0 = vmcs_read64(GUEST_CR0);
713         cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);
714         cpu_data->linux_cr4 = vmcs_read64(GUEST_CR4);
715
716         cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
717         cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
718         cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
719         cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);
720
721         cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);
722
723         cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);
724
725         cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
726         cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
727         cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);
728
729         write_msr(MSR_IA32_SYSENTER_CS, vmcs_read32(GUEST_SYSENTER_CS));
730         write_msr(MSR_IA32_SYSENTER_EIP, vmcs_read64(GUEST_SYSENTER_EIP));
731         write_msr(MSR_IA32_SYSENTER_ESP, vmcs_read64(GUEST_SYSENTER_ESP));
732
733         cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
734         cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
735         cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
736         cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);
737
738         arch_cpu_restore(cpu_data, 0);
739
740         stack--;
741         *stack = linux_ip;
742
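        /*
         * Restore the general-purpose registers from the saved guest_regs
         * frame (the "add $8" skips the unused RSP slot), switch to the
         * Linux stack and return to the saved Linux RIP pushed above.
         */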
743         asm volatile (
744                 "mov %%rbx,%%rsp\n\t"
745                 "pop %%r15\n\t"
746                 "pop %%r14\n\t"
747                 "pop %%r13\n\t"
748                 "pop %%r12\n\t"
749                 "pop %%r11\n\t"
750                 "pop %%r10\n\t"
751                 "pop %%r9\n\t"
752                 "pop %%r8\n\t"
753                 "pop %%rdi\n\t"
754                 "pop %%rsi\n\t"
755                 "pop %%rbp\n\t"
756                 "add $8,%%rsp\n\t"
757                 "pop %%rbx\n\t"
758                 "pop %%rdx\n\t"
759                 "pop %%rcx\n\t"
760                 "mov %%rax,%%rsp\n\t"
761                 "xor %%rax,%%rax\n\t"
762                 "ret"
763                 : : "a" (stack), "b" (&cpu_data->guest_regs));
764         __builtin_unreachable();
765 }
766
767 static void vmx_vcpu_reset(unsigned int sipi_vector)
768 {
769         unsigned long val;
770         bool ok = true;
771
772         ok &= vmx_set_guest_cr(CR0_IDX, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
773         ok &= vmx_set_guest_cr(CR4_IDX, 0);
774
775         ok &= vmcs_write64(GUEST_CR3, 0);
776
777         ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
778         ok &= vmcs_write64(GUEST_RSP, 0);
779
780         val = 0;
781         if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
782                 val = 0xfff0;
783                 sipi_vector = 0xf0;
784
785                 /* only cleared on hard reset */
786                 ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
787         }
788         ok &= vmcs_write64(GUEST_RIP, val);
789
790         ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
791         ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
792         ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
793         ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);
794
795         ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
796         ok &= vmcs_write64(GUEST_DS_BASE, 0);
797         ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
798         ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);
799
800         ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
801         ok &= vmcs_write64(GUEST_ES_BASE, 0);
802         ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
803         ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);
804
805         ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
806         ok &= vmcs_write64(GUEST_FS_BASE, 0);
807         ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
808         ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);
809
810         ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
811         ok &= vmcs_write64(GUEST_GS_BASE, 0);
812         ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
813         ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);
814
815         ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
816         ok &= vmcs_write64(GUEST_SS_BASE, 0);
817         ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
818         ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);
819
820         ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
821         ok &= vmcs_write64(GUEST_TR_BASE, 0);
822         ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
823         ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);
824
825         ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
826         ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
827         ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
828         ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
829
830         ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
831         ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
832         ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
833         ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
834
835         ok &= vmcs_write64(GUEST_IA32_EFER, 0);
836
837         ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
838         ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
839         ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);
840
841         ok &= vmcs_write64(GUEST_DR7, 0x00000400);
842
843         ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
844         ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
845         ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
846
847         val = vmcs_read32(VM_ENTRY_CONTROLS);
848         val &= ~VM_ENTRY_IA32E_MODE;
849         ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
850
851         ok &= vmx_set_cell_config();
852
853         if (!ok) {
854                 panic_printk("FATAL: CPU reset failed\n");
855                 panic_stop();
856         }
857 }
858
859 void vcpu_nmi_handler(void)
860 {
861         u32 pin_based_ctrl;
862
863         if (this_cpu_data()->vmx_state != VMCS_READY)
864                 return;
865
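        /*
         * Arm the preemption timer (its value stays 0, see vmcs_setup) so
         * that the next VM entry exits immediately and pending events are
         * handled in vcpu_handle_exit().
         */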
866         pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
867         pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
868         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
869 }
870
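/* Put the CPU into a reset-like HLT state until it receives the next SIPI. */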
871 void vcpu_park(void)
872 {
873         vmx_vcpu_reset(0);
874         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
875 }
876
877 static void vmx_disable_preemption_timer(void)
878 {
879         u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
880
881         pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
882         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
883 }
884
885 void vcpu_skip_emulated_instruction(unsigned int inst_len)
886 {
887         vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
888 }
889
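/*
 * Called when the guest enables paging with EFER.LME already set: switching
 * into long mode requires the hypervisor to set EFER.LMA and the IA-32e mode
 * entry control, since the triggering CR0 write was intercepted and emulated.
 */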
890 static void update_efer(void)
891 {
892         unsigned long efer = vmcs_read64(GUEST_IA32_EFER);
893
894         if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
895                 return;
896
897         efer |= EFER_LMA;
898         vmcs_write64(GUEST_IA32_EFER, efer);
899         vmcs_write32(VM_ENTRY_CONTROLS,
900                      vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
901 }
902
903 static bool vmx_handle_cr(void)
904 {
905         u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
906         unsigned long cr, reg, val;
907
908         cr = exit_qualification & 0xf;
909         reg = (exit_qualification >> 8) & 0xf;
910
911         switch ((exit_qualification >> 4) & 3) {
912         case 0: /* move to cr */
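                /*
                 * Bits 11:8 of the qualification select the source GPR
                 * (0 = RAX ... 15 = R15); RSP (4) is not part of the saved
                 * guest_regs frame, which stores the GPRs in reverse order
                 * (hence the 15 - reg index).
                 */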
913                 if (reg == 4)
914                         val = vmcs_read64(GUEST_RSP);
915                 else
916                         val = this_cpu_data()->guest_regs.by_index[15 - reg];
917
918                 if (cr == 0 || cr == 4) {
919                         vcpu_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
920                         /* TODO: check for #GP reasons */
921                         vmx_set_guest_cr(cr ? CR4_IDX : CR0_IDX, val);
922                         if (cr == 0 && val & X86_CR0_PG)
923                                 update_efer();
924                         return true;
925                 }
926                 break;
927         default:
928                 break;
929         }
930         panic_printk("FATAL: Unhandled CR access, qualification %x\n",
931                      exit_qualification);
932         return false;
933 }
934
935 bool vcpu_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
936 {
937         if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
938                 pg_structs->root_paging = x86_64_paging;
939                 pg_structs->root_table_gphys =
940                         vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
941         } else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
942                  !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
943                 pg_structs->root_paging = i386_paging;
944                 pg_structs->root_table_gphys =
945                         vmcs_read64(GUEST_CR3) & 0xfffff000UL;
946         } else {
947                 printk("FATAL: Unsupported paging mode\n");
948                 return false;
949         }
950         return true;
951 }
952
953 void vcpu_vendor_set_guest_pat(unsigned long val)
954 {
955         vmcs_write64(GUEST_IA32_PAT, val);
956 }
957
958 static bool vmx_handle_apic_access(void)
959 {
960         struct guest_paging_structures pg_structs;
961         unsigned int inst_len, offset;
962         u64 qualification;
963         bool is_write;
964
965         qualification = vmcs_read64(EXIT_QUALIFICATION);
966
967         switch (qualification & APIC_ACCESS_TYPE_MASK) {
968         case APIC_ACCESS_TYPE_LINEAR_READ:
969         case APIC_ACCESS_TYPE_LINEAR_WRITE:
970                 is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
971                 offset = qualification & APIC_ACCESS_OFFSET_MASK;
972                 if (offset & 0x00f)
973                         break;
974
975                 if (!vcpu_get_guest_paging_structs(&pg_structs))
976                         break;
977
978                 inst_len = apic_mmio_access(vmcs_read64(GUEST_RIP),
979                                             &pg_structs, offset >> 4,
980                                             is_write);
981                 if (!inst_len)
982                         break;
983
984                 vcpu_skip_emulated_instruction(inst_len);
985                 return true;
986         }
987         panic_printk("FATAL: Unhandled APIC access, "
988                      "qualification %x\n", qualification);
989         return false;
990 }
991
992 static void dump_vm_exit_details(u32 reason)
993 {
994         panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
995         panic_printk("vectoring info: %x interrupt info: %x\n",
996                      vmcs_read32(IDT_VECTORING_INFO_FIELD),
997                      vmcs_read32(VM_EXIT_INTR_INFO));
998         if (reason == EXIT_REASON_EPT_VIOLATION ||
999             reason == EXIT_REASON_EPT_MISCONFIG)
1000                 panic_printk("guest phys addr %p guest linear addr: %p\n",
1001                              vmcs_read64(GUEST_PHYSICAL_ADDRESS),
1002                              vmcs_read64(GUEST_LINEAR_ADDRESS));
1003 }
1004
1005 static void dump_guest_regs(union registers *guest_regs)
1006 {
1007         panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
1008                      vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
1009         panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
1010                      guest_regs->rbx, guest_regs->rcx);
1011         panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
1012                      guest_regs->rsi, guest_regs->rdi);
1013         panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
1014                      vmcs_read64(GUEST_CS_SELECTOR),
1015                      vmcs_read64(GUEST_CS_BASE),
1016                      vmcs_read32(GUEST_CS_AR_BYTES),
1017                      !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
1018         panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
1019                      vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
1020         panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
1021 }
1022
1023 void vcpu_vendor_get_io_intercept(struct vcpu_io_intercept *io)
1024 {
1025         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1026
1027         /* parse exit qualification for I/O instructions (see SDM, 27.2.1) */
1028         io->port = (exitq >> 16) & 0xFFFF;
1029         io->size = (exitq & 0x3) + 1;
1030         io->in = !!((exitq & 0x8) >> 3);
1031         io->inst_len = vmcs_read64(VM_EXIT_INSTRUCTION_LEN);
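        /* bit 4: string instruction (INS/OUTS), bit 5: REP prefixed */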
1032         io->rep_or_str = !!(exitq & 0x30);
1033 }
1034
1035 void vcpu_vendor_get_mmio_intercept(struct vcpu_mmio_intercept *mmio)
1036 {
1037         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1038
1039         mmio->phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1040         /* We don't enable dirty/accessed bit updates in the EPTP,
1041          * so only the read or the write flag can be set, not both. */
1042         mmio->is_write = !!(exitq & 0x2);
1043 }
1044
1045 void vcpu_handle_exit(struct per_cpu *cpu_data)
1046 {
1047         u32 reason = vmcs_read32(VM_EXIT_REASON);
1048         int sipi_vector;
1049
1050         cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;
1051
1052         switch (reason) {
1053         case EXIT_REASON_EXCEPTION_NMI:
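                /*
                 * An NMI that arrived in guest mode: re-raise it so the
                 * hypervisor's own NMI handler runs, then fall through to
                 * process pending management events.
                 */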
1054                 asm volatile("int %0" : : "i" (NMI_VECTOR));
1055                 /* fall through */
1056         case EXIT_REASON_PREEMPTION_TIMER:
1057                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
1058                 vmx_disable_preemption_timer();
1059                 sipi_vector = x86_handle_events(cpu_data);
1060                 if (sipi_vector >= 0) {
1061                         printk("CPU %d received SIPI, vector %x\n",
1062                                cpu_data->cpu_id, sipi_vector);
1063                         vmx_vcpu_reset(sipi_vector);
1064                         vcpu_reset(sipi_vector == APIC_BSP_PSEUDO_SIPI);
1065                 }
1066                 iommu_check_pending_faults();
1067                 return;
1068         case EXIT_REASON_CPUID:
1069                 vcpu_handle_cpuid();
1070                 return;
1071         case EXIT_REASON_VMCALL:
1072                 vcpu_handle_hypercall();
1073                 return;
1074         case EXIT_REASON_CR_ACCESS:
1075                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
1076                 if (vmx_handle_cr())
1077                         return;
1078                 break;
1079         case EXIT_REASON_MSR_READ:
1080                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
1081                 if (vcpu_handle_msr_read())
1082                         return;
1083                 break;
1084         case EXIT_REASON_MSR_WRITE:
1085                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
1086                 if (cpu_data->guest_regs.rcx == MSR_IA32_PERF_GLOBAL_CTRL) {
1087                         /* ignore writes */
1088                         vcpu_skip_emulated_instruction(X86_INST_LEN_WRMSR);
1089                         return;
1090                 } else if (vcpu_handle_msr_write())
1091                         return;
1092                 break;
1093         case EXIT_REASON_APIC_ACCESS:
1094                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
1095                 if (vmx_handle_apic_access())
1096                         return;
1097                 break;
1098         case EXIT_REASON_XSETBV:
1099                 if (vcpu_handle_xsetbv())
1100                         return;
1101                 break;
1102         case EXIT_REASON_IO_INSTRUCTION:
1103                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
1104                 if (vcpu_handle_io_access())
1105                         return;
1106                 break;
1107         case EXIT_REASON_EPT_VIOLATION:
1108                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
1109                 if (vcpu_handle_mmio_access())
1110                         return;
1111                 break;
1112         default:
1113                 panic_printk("FATAL: %s, reason %d\n",
1114                              (reason & EXIT_REASONS_FAILED_VMENTRY) ?
1115                              "VM-Entry failure" : "Unhandled VM-Exit",
1116                              (u16)reason);
1117                 dump_vm_exit_details(reason);
1118                 break;
1119         }
1120         dump_guest_regs(&cpu_data->guest_regs);
1121         panic_park();
1122 }
1123
1124 void vmx_entry_failure(void)
1125 {
1126         panic_printk("FATAL: vmresume failed, error %d\n",
1127                      vmcs_read32(VM_INSTRUCTION_ERROR));
1128         panic_stop();
1129 }
1130
1131 void vcpu_vendor_get_cell_io_bitmap(struct cell *cell,
1132                                     struct vcpu_io_bitmap *iobm)
1133 {
1134         iobm->data = cell->vmx.io_bitmap;
1135         iobm->size = sizeof(cell->vmx.io_bitmap);
1136 }
1137
1138 void vcpu_vendor_get_execution_state(struct vcpu_execution_state *x_state)
1139 {
1140         x_state->efer = vmcs_read64(GUEST_IA32_EFER);
1141         x_state->rflags = vmcs_read64(GUEST_RFLAGS);
1142         x_state->cs = vmcs_read16(GUEST_CS_SELECTOR);
1143         x_state->rip = vmcs_read64(GUEST_RIP);
1144 }
1145
1146 void enable_irq(void)
1147 {
1148         asm volatile("sti" : : : "memory");
1149 }
1150
1151 void disable_irq(void)
1152 {
1153         asm volatile("cli" : : : "memory");
1154 }