hypervisor/arch/x86/vmx.c
1 /*
2  * Jailhouse, a Linux-based partitioning hypervisor
3  *
4  * Copyright (c) Siemens AG, 2013
5  *
6  * Authors:
7  *  Jan Kiszka <jan.kiszka@siemens.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  */
12
13 #include <jailhouse/entry.h>
14 #include <jailhouse/paging.h>
15 #include <jailhouse/processor.h>
16 #include <jailhouse/printk.h>
17 #include <jailhouse/string.h>
18 #include <jailhouse/control.h>
19 #include <jailhouse/hypercall.h>
20 #include <jailhouse/mmio.h>
21 #include <jailhouse/pci.h>
22 #include <asm/apic.h>
23 #include <asm/control.h>
24 #include <asm/io.h>
25 #include <asm/ioapic.h>
26 #include <asm/pci.h>
27 #include <asm/vmx.h>
28 #include <asm/vtd.h>
29
30 static const struct segment invalid_seg = {
31         .access_rights = 0x10000
32 };
33
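/*
 * MSR bitmap shared by all cells: a set bit makes the corresponding
 * RDMSR/WRMSR trap to the hypervisor, a clear bit grants direct access.
 * The four 1K blocks cover reads and writes of MSRs 0x00000000-0x00001fff
 * and 0xc0000000-0xc0001fff; bit (msr & 7) of byte (msr >> 3) selects the
 * MSR within a block. The bits set here intercept selected registers of
 * the x2APIC MSR window (0x800-0x83f); vmx_init() opens that window up
 * again, except for ICR writes, when the system runs in x2APIC mode.
 */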
34 static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
35         [ VMX_MSR_BMP_0000_READ ] = {
36                 [      0/8 ...  0x7ff/8 ] = 0,
37                 [  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
38                 [  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
39                 [  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
40                 [  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
41                 [  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
42                 [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
43                 [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
44                 [  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
45                 [  0x840/8 ... 0x1fff/8 ] = 0,
46         },
47         [ VMX_MSR_BMP_C000_READ ] = {
48                 [      0/8 ... 0x1fff/8 ] = 0,
49         },
50         [ VMX_MSR_BMP_0000_WRITE ] = {
51                 [      0/8 ...  0x807/8 ] = 0,
52                 [  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
53                 [  0x810/8 ...  0x827/8 ] = 0,
54                 [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
55                 [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
56                 [  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
57                 [  0x840/8 ... 0x1fff/8 ] = 0,
58         },
59         [ VMX_MSR_BMP_C000_WRITE ] = {
60                 [      0/8 ... 0x1fff/8 ] = 0,
61         },
62 };
63 static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
64 static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];
65
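/*
 * Thin wrappers around the VMXON/VMCLEAR/VMPTRLD instructions. VMX
 * reports success via CF=0 and ZF=0, so "seta" converts the resulting
 * flags into a boolean return value.
 */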
66 static bool vmxon(struct per_cpu *cpu_data)
67 {
68         unsigned long vmxon_addr;
69         u8 ok;
70
71         vmxon_addr = page_map_hvirt2phys(&cpu_data->vmxon_region);
72         asm volatile(
73                 "vmxon (%1)\n\t"
74                 "seta %0"
75                 : "=rm" (ok)
76                 : "r" (&vmxon_addr), "m" (vmxon_addr)
77                 : "memory", "cc");
78         return ok;
79 }
80
81 static bool vmcs_clear(struct per_cpu *cpu_data)
82 {
83         unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
84         u8 ok;
85
86         asm volatile(
87                 "vmclear (%1)\n\t"
88                 "seta %0"
89                 : "=qm" (ok)
90                 : "r" (&vmcs_addr), "m" (vmcs_addr)
91                 : "memory", "cc");
92         return ok;
93 }
94
95 static bool vmcs_load(struct per_cpu *cpu_data)
96 {
97         unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
98         u8 ok;
99
100         asm volatile(
101                 "vmptrld (%1)\n\t"
102                 "seta %0"
103                 : "=qm" (ok)
104                 : "r" (&vmcs_addr), "m" (vmcs_addr)
105                 : "memory", "cc");
106         return ok;
107 }
108
109 static inline unsigned long vmcs_read64(unsigned long field)
110 {
111         unsigned long value;
112
113         asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
114         return value;
115 }
116
117 static inline u16 vmcs_read16(unsigned long field)
118 {
119         return vmcs_read64(field);
120 }
121
122 static inline u32 vmcs_read32(unsigned long field)
123 {
124         return vmcs_read64(field);
125 }
126
127 static bool vmcs_write64(unsigned long field, unsigned long val)
128 {
129         u8 ok;
130
131         asm volatile(
132                 "vmwrite %1,%2\n\t"
133                 "setnz %0"
134                 : "=qm" (ok)
135                 : "r" (val), "r" (field)
136                 : "cc");
137         if (!ok)
138                 printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
139                        field, vmcs_read32(VM_INSTRUCTION_ERROR),
140                        __builtin_return_address(0));
141         return ok;
142 }
143
144 static bool vmcs_write16(unsigned long field, u16 value)
145 {
146         return vmcs_write64(field, value);
147 }
148
149 static bool vmcs_write32(unsigned long field, u32 value)
150 {
151         return vmcs_write64(field, value);
152 }
153
154 static int vmx_check_features(void)
155 {
156         unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
157         unsigned long vmx_pin_ctrl, vmx_basic;
158
159         if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
160                 return -ENODEV;
161
162         vmx_basic = read_msr(MSR_IA32_VMX_BASIC);
163
164         /* require VMCS size <= PAGE_SIZE,
165          * VMCS memory access type == write back and
166          * availability of TRUE_*_CTLS */
167         if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
168             ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
169             !(vmx_basic & (1UL << 55)))
170                 return -EIO;
171
172         /* require NMI exiting and preemption timer support */
173         vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
174         if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
175             !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
176                 return -EIO;
177
178         /* require I/O and MSR bitmap as well as secondary controls support */
179         vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
180         if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
181             !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
182             !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
183                 return -EIO;
184
185         /* require disabling of CR3 access interception */
186         vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
187         if (vmx_proc_ctrl &
188             (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
189                 return -EIO;
190
191         /* require APIC access, EPT and unrestricted guest mode support */
192         vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
193         ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
194         if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
195             !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
196             (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
197             !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
198             !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
199                 return -EIO;
200
201         /* require activity state HLT */
202         if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
203                 return -EIO;
204
205         return 0;
206 }
207
208 static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
209 {
210         *pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
211                 EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
212 }
213
214 int vmx_init(void)
215 {
216         unsigned int n;
217         int err;
218
219         err = vmx_check_features();
220         if (err)
221                 return err;
222
223         /* derive ept_paging from very similar x86_64_paging */
224         memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
225         for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
226                 ept_paging[n].set_next_pt = ept_set_next_pt;
227         if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
228                 ept_paging[1].page_size = 0;
229         if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
230                 ept_paging[2].page_size = 0;
231
232         if (using_x2apic) {
233                 /* allow direct x2APIC access except for ICR writes */
234                 memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
235                        0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
236                 memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
237                        0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
238                 msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;
239         }
240
241         return vmx_cell_init(&root_cell);
242 }
243
244 unsigned long arch_page_map_gphys2phys(struct per_cpu *cpu_data,
245                                        unsigned long gphys)
246 {
247         return page_map_virt2phys(&cpu_data->cell->vmx.ept_structs, gphys);
248 }
249
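/*
 * Set up the VMX-specific state of a cell: allocate the root EPT page
 * table, map the APIC access page over the xAPIC MMIO window and install
 * the cell's PIO bitmap. The two 4K bitmap pages correspond to
 * IO_BITMAP_A (ports 0x0000-0x7fff) and IO_BITMAP_B (ports 0x8000-0xffff);
 * a set bit causes a VM exit on access. For non-root cells, the ports
 * handed over are additionally removed from the root cell's bitmap.
 */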
250 int vmx_cell_init(struct cell *cell)
251 {
252         const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
253         u32 pio_bitmap_size = cell->config->pio_bitmap_size;
254         int n, err;
255         u32 size;
256         u8 *b;
257
258         /* build root EPT of cell */
259         cell->vmx.ept_structs.root_paging = ept_paging;
260         cell->vmx.ept_structs.root_table = page_alloc(&mem_pool, 1);
261         if (!cell->vmx.ept_structs.root_table)
262                 return -ENOMEM;
263
264         err = page_map_create(&cell->vmx.ept_structs,
265                               page_map_hvirt2phys(apic_access_page),
266                               PAGE_SIZE, XAPIC_BASE,
267                               EPT_FLAG_READ|EPT_FLAG_WRITE|EPT_FLAG_WB_TYPE,
268                               PAGE_MAP_NON_COHERENT);
269         if (err) {
270                 vmx_cell_exit(cell);
271                 return err;
272         }
273
274         memset(cell->vmx.io_bitmap, -1, sizeof(cell->vmx.io_bitmap));
275
276         for (n = 0; n < 2; n++) {
277                 size = pio_bitmap_size <= PAGE_SIZE ?
278                         pio_bitmap_size : PAGE_SIZE;
279                 memcpy(cell->vmx.io_bitmap + n * PAGE_SIZE, pio_bitmap, size);
280                 pio_bitmap += size;
281                 pio_bitmap_size -= size;
282         }
283
284         if (cell != &root_cell) {
285                 /*
286                  * Shrink the root cell's PIO access according to the new
287                  * cell's access rights.
288                  */
289                 pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
290                 pio_bitmap_size = cell->config->pio_bitmap_size;
291                 for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
292                      b++, pio_bitmap++, pio_bitmap_size--)
293                         *b |= ~*pio_bitmap;
294
295                 vmx_invept();
296         }
297
298         return 0;
299 }
300
301 int vmx_map_memory_region(struct cell *cell,
302                           const struct jailhouse_memory *mem)
303 {
304         u64 phys_start = mem->phys_start;
305         u32 flags = EPT_FLAG_WB_TYPE;
306
307         if (mem->flags & JAILHOUSE_MEM_READ)
308                 flags |= EPT_FLAG_READ;
309         if (mem->flags & JAILHOUSE_MEM_WRITE)
310                 flags |= EPT_FLAG_WRITE;
311         if (mem->flags & JAILHOUSE_MEM_EXECUTE)
312                 flags |= EPT_FLAG_EXECUTE;
313         if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
314                 phys_start = page_map_hvirt2phys(&cell->comm_page);
315
316         return page_map_create(&cell->vmx.ept_structs, phys_start, mem->size,
317                                mem->virt_start, flags, PAGE_MAP_NON_COHERENT);
318 }
319
320 int vmx_unmap_memory_region(struct cell *cell,
321                             const struct jailhouse_memory *mem)
322 {
323         return page_map_destroy(&cell->vmx.ept_structs, mem->virt_start,
324                                 mem->size, PAGE_MAP_NON_COHERENT);
325 }
326
327 void vmx_cell_exit(struct cell *cell)
328 {
329         const u8 *root_pio_bitmap =
330                 jailhouse_cell_pio_bitmap(root_cell.config);
331         const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
332         u32 pio_bitmap_size = cell->config->pio_bitmap_size;
333         u8 *b;
334
335         page_map_destroy(&cell->vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
336                          PAGE_MAP_NON_COHERENT);
337
338         if (root_cell.config->pio_bitmap_size < pio_bitmap_size)
339                 pio_bitmap_size = root_cell.config->pio_bitmap_size;
340
341         for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
342              b++, pio_bitmap++, root_pio_bitmap++, pio_bitmap_size--)
343                 *b &= *pio_bitmap | *root_pio_bitmap;
344
345         page_free(&mem_pool, cell->vmx.ept_structs.root_table, 1);
346 }
347
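/*
 * Flush TLB entries derived from EPT: prefer invalidating only the
 * current EPT context, fall back to a global invalidation if the CPU
 * only supports that.
 */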
348 void vmx_invept(void)
349 {
350         unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
351         struct {
352                 u64 eptp;
353                 u64 reserved;
354         } descriptor;
355         u64 type;
356         u8 ok;
357
358         descriptor.reserved = 0;
359         if (ept_cap & EPT_INVEPT_SINGLE) {
360                 type = VMX_INVEPT_SINGLE;
361                 descriptor.eptp = vmcs_read64(EPT_POINTER);
362         } else {
363                 type = VMX_INVEPT_GLOBAL;
364                 descriptor.eptp = 0;
365         }
366         asm volatile(
367                 "invept (%1),%2\n\t"
368                 "seta %0\n\t"
369                 : "=qm" (ok)
370                 : "r" (&descriptor), "r" (type)
371                 : "memory", "cc");
372
373         if (!ok) {
374                 panic_printk("FATAL: invept failed, error %d\n",
375                              vmcs_read32(VM_INSTRUCTION_ERROR));
376                 panic_stop(NULL);
377         }
378 }
379
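/*
 * Write a guest control register while respecting the VMX constraints:
 * MSR_IA32_VMX_CRn_FIXED0/FIXED1 define which CR0/CR4 bits have to be set
 * or must remain clear during VMX operation (with unrestricted guest
 * support, CR0.PE and CR0.PG are exempted). The read shadow and the
 * guest/host mask make the guest see the value it requested while the
 * constrained bits stay under hypervisor control.
 */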
380 static bool vmx_set_guest_cr(int cr, unsigned long val)
381 {
382         unsigned long fixed0, fixed1, required1;
383         bool ok = true;
384
385         fixed0 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED0
386                              : MSR_IA32_VMX_CR0_FIXED0);
387         fixed1 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED1
388                              : MSR_IA32_VMX_CR0_FIXED1);
389         required1 = fixed0 & fixed1;
390         if (cr == 0) {
391                 fixed1 &= ~(X86_CR0_NW | X86_CR0_CD);
392                 required1 &= ~(X86_CR0_PE | X86_CR0_PG);
393                 required1 |= X86_CR0_ET;
394         } else {
395                 /* keeps the hypervisor visible */
396                 val |= X86_CR4_VMXE;
397         }
398         ok &= vmcs_write64(cr ? GUEST_CR4 : GUEST_CR0,
399                            (val & fixed1) | required1);
400         ok &= vmcs_write64(cr ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
401         ok &= vmcs_write64(cr ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
402                            required1 | ~fixed1);
403
404         return ok;
405 }
406
407 static bool vmx_set_cell_config(struct cell *cell)
408 {
409         u8 *io_bitmap;
410         bool ok = true;
411
412         io_bitmap = cell->vmx.io_bitmap;
413         ok &= vmcs_write64(IO_BITMAP_A, page_map_hvirt2phys(io_bitmap));
414         ok &= vmcs_write64(IO_BITMAP_B,
415                            page_map_hvirt2phys(io_bitmap + PAGE_SIZE));
416
417         ok &= vmcs_write64(EPT_POINTER,
418                         page_map_hvirt2phys(cell->vmx.ept_structs.root_table) |
419                         EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);
420
421         return ok;
422 }
423
424 static bool vmx_set_guest_segment(const struct segment *seg,
425                                   unsigned long selector_field)
426 {
427         bool ok = true;
428
429         ok &= vmcs_write16(selector_field, seg->selector);
430         ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
431         ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
432         ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
433                            seg->access_rights);
434         return ok;
435 }
436
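/*
 * Populate the VMCS for this CPU: the host state describes the hypervisor
 * context to return to on VM exits, the guest state reproduces the Linux
 * context that was saved on entry so that VM entry resumes Linux exactly
 * where it handed over control.
 */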
437 static bool vmcs_setup(struct per_cpu *cpu_data)
438 {
439         struct desc_table_reg dtr;
440         unsigned long val;
441         bool ok = true;
442
443         ok &= vmcs_write64(HOST_CR0, read_cr0());
444         ok &= vmcs_write64(HOST_CR3, read_cr3());
445         ok &= vmcs_write64(HOST_CR4, read_cr4());
446
447         ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
448         ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
449         ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
450         ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
451         ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
452         ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
453         ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);
454
455         ok &= vmcs_write64(HOST_FS_BASE, 0);
456         ok &= vmcs_write64(HOST_GS_BASE, 0);
457         ok &= vmcs_write64(HOST_TR_BASE, 0);
458
459         read_gdtr(&dtr);
460         ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
461         read_idtr(&dtr);
462         ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);
463
464         ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);
465
466         ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
467         ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
468         ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);
469
470         ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
471                            sizeof(cpu_data->stack));
472         ok &= vmcs_write64(HOST_RIP, (unsigned long)vm_exit);
473
474         ok &= vmx_set_guest_cr(0, read_cr0());
475         ok &= vmx_set_guest_cr(4, read_cr4());
476
477         ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);
478
479         ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
480         ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
481         ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
482         ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
483         ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
484         ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
485         ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
486         ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);
487
488         ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
489         ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
490         ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
491         ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);
492
493         ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
494         ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
495                            (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
496         ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);
497
498         ok &= vmcs_write32(GUEST_SYSENTER_CS,
499                            read_msr(MSR_IA32_SYSENTER_CS));
500         ok &= vmcs_write64(GUEST_SYSENTER_EIP,
501                            read_msr(MSR_IA32_SYSENTER_EIP));
502         ok &= vmcs_write64(GUEST_SYSENTER_ESP,
503                            read_msr(MSR_IA32_SYSENTER_ESP));
504
505         ok &= vmcs_write64(GUEST_DR7, 0x00000400);
506         ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
507
508         ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
509         ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
510         ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
511
512         ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);
513
514         /* TODO: switch PAT, PERF */
515
516         ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
517         ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
518
519         val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
520         val |= PIN_BASED_NMI_EXITING;
521         ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
522
523         ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
524
525         val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
526         val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
527                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
528         val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
529         ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);
530
531         ok &= vmcs_write64(MSR_BITMAP, page_map_hvirt2phys(msr_bitmap));
532
533         val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
534         val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
535                 SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST;
536         ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);
537
538         ok &= vmcs_write64(APIC_ACCESS_ADDR,
539                            page_map_hvirt2phys(apic_access_page));
540
541         ok &= vmx_set_cell_config(cpu_data->cell);
542
543         ok &= vmcs_write32(EXCEPTION_BITMAP, 0);
544
545         val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
546         val |= VM_EXIT_HOST_ADDR_SPACE_SIZE | VM_EXIT_SAVE_IA32_EFER |
547                 VM_EXIT_LOAD_IA32_EFER;
548         ok &= vmcs_write32(VM_EXIT_CONTROLS, val);
549
550         ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
551         ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
552         ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
553
554         val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
555         val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER;
556         ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
557
558         ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);
559
560         ok &= vmcs_write32(CR3_TARGET_COUNT, 0);
561
562         return ok;
563 }
564
565 int vmx_cpu_init(struct per_cpu *cpu_data)
566 {
567         unsigned long cr4, feature_ctrl, mask;
568         u32 revision_id;
569         int err;
570
571         cr4 = read_cr4();
572         if (cr4 & X86_CR4_VMXE)
573                 return -EBUSY;
574
575         err = vmx_check_features();
576         if (err)
577                 return err;
578
579         revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
580         cpu_data->vmxon_region.revision_id = revision_id;
581         cpu_data->vmxon_region.shadow_indicator = 0;
582         cpu_data->vmcs.revision_id = revision_id;
583         cpu_data->vmcs.shadow_indicator = 0;
584
585         // TODO: validate CR0
586
587         /* Note: We assume that TXT is off */
588         feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
589         mask = FEATURE_CONTROL_LOCKED |
590                 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
591
592         if ((feature_ctrl & mask) != mask) {
593                 if (feature_ctrl & FEATURE_CONTROL_LOCKED)
594                         return -ENODEV;
595
596                 feature_ctrl |= mask;
597                 write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
598         }
599
600         write_cr4(cr4 | X86_CR4_VMXE);
601         // TODO: validate CR4
602
603         if (!vmxon(cpu_data))  {
604                 write_cr4(cr4);
605                 return -EIO;
606         }
607
608         cpu_data->vmx_state = VMXON;
609
610         if (!vmcs_clear(cpu_data) ||
611             !vmcs_load(cpu_data) ||
612             !vmcs_setup(cpu_data))
613                 return -EIO;
614
615         cpu_data->vmx_state = VMCS_READY;
616
617         return 0;
618 }
619
620 void vmx_cpu_exit(struct per_cpu *cpu_data)
621 {
622         if (cpu_data->vmx_state == VMXOFF)
623                 return;
624
625         cpu_data->vmx_state = VMXOFF;
626         /* Write vmx_state to ensure that vmx_schedule_vmexit stops accessing
627          * the VMCS (a compiler barrier would be sufficient, in fact). */
628         memory_barrier();
629
630         vmcs_clear(cpu_data);
631         asm volatile("vmxoff" : : : "cc");
632         write_cr4(read_cr4() & ~X86_CR4_VMXE);
633 }
634
635 void vmx_cpu_activate_vmm(struct per_cpu *cpu_data)
636 {
637         /* We enter Linux at the point arch_entry would return to as well.
638          * rax is cleared to signal success to the caller. */
639         asm volatile(
640                 "mov (%%rdi),%%r15\n\t"
641                 "mov 0x8(%%rdi),%%r14\n\t"
642                 "mov 0x10(%%rdi),%%r13\n\t"
643                 "mov 0x18(%%rdi),%%r12\n\t"
644                 "mov 0x20(%%rdi),%%rbx\n\t"
645                 "mov 0x28(%%rdi),%%rbp\n\t"
646                 "vmlaunch\n\t"
647                 "pop %%rbp"
648                 : /* no output */
649                 : "a" (0), "D" (cpu_data->linux_reg)
650                 : "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");
651
652         panic_printk("FATAL: vmlaunch failed, error %d\n",
653                      vmcs_read32(VM_INSTRUCTION_ERROR));
654         panic_stop(cpu_data);
655 }
656
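/*
 * Leave VMX non-root mode on JAILHOUSE_HC_DISABLE: copy the Linux state
 * back from the VMCS into cpu_data, push the return address onto the
 * Linux stack and restore the general-purpose registers from the exit
 * frame. The final "ret" resumes Linux right after the disabling vmcall,
 * with RAX cleared to report success.
 */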
657 static void __attribute__((noreturn))
658 vmx_cpu_deactivate_vmm(struct registers *guest_regs, struct per_cpu *cpu_data)
659 {
660         unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
661         unsigned long linux_ip = vmcs_read64(GUEST_RIP);
662
663         cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);
664
665         cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
666         cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
667         cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
668         cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);
669
670         cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);
671
672         cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);
673
674         cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
675         cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
676         cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);
677
678         cpu_data->linux_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
679         cpu_data->linux_sysenter_eip = vmcs_read64(GUEST_SYSENTER_EIP);
680         cpu_data->linux_sysenter_esp = vmcs_read64(GUEST_SYSENTER_ESP);
681
682         cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
683         cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
684         cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
685         cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);
686
687         arch_cpu_restore(cpu_data);
688
689         stack--;
690         *stack = linux_ip;
691
692         asm volatile (
693                 "mov %%rbx,%%rsp\n\t"
694                 "pop %%r15\n\t"
695                 "pop %%r14\n\t"
696                 "pop %%r13\n\t"
697                 "pop %%r12\n\t"
698                 "pop %%r11\n\t"
699                 "pop %%r10\n\t"
700                 "pop %%r9\n\t"
701                 "pop %%r8\n\t"
702                 "pop %%rdi\n\t"
703                 "pop %%rsi\n\t"
704                 "pop %%rbp\n\t"
705                 "add $8,%%rsp\n\t"
706                 "pop %%rbx\n\t"
707                 "pop %%rdx\n\t"
708                 "pop %%rcx\n\t"
709                 "mov %%rax,%%rsp\n\t"
710                 "xor %%rax,%%rax\n\t"
711                 "ret"
712                 : : "a" (stack), "b" (guest_regs));
713         __builtin_unreachable();
714 }
715
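/*
 * Put the guest into the architectural INIT/SIPI state: 16-bit real mode
 * with CS and RIP derived from the SIPI vector. The pseudo SIPI used for
 * the BSP selects the classic reset entry point 0xf000:0xfff0 instead.
 */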
716 static void vmx_cpu_reset(struct per_cpu *cpu_data, unsigned int sipi_vector)
717 {
718         unsigned long val;
719         bool ok = true;
720
721         ok &= vmx_set_guest_cr(0, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
722         ok &= vmx_set_guest_cr(4, 0);
723
724         ok &= vmcs_write64(GUEST_CR3, 0);
725
726         ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
727         ok &= vmcs_write64(GUEST_RSP, 0);
728
729         val = 0;
730         if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
731                 val = 0xfff0;
732                 sipi_vector = 0xf0;
733         }
734         ok &= vmcs_write64(GUEST_RIP, val);
735
736         ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
737         ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
738         ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
739         ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);
740
741         ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
742         ok &= vmcs_write64(GUEST_DS_BASE, 0);
743         ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
744         ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);
745
746         ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
747         ok &= vmcs_write64(GUEST_ES_BASE, 0);
748         ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
749         ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);
750
751         ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
752         ok &= vmcs_write64(GUEST_FS_BASE, 0);
753         ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
754         ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);
755
756         ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
757         ok &= vmcs_write64(GUEST_GS_BASE, 0);
758         ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
759         ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);
760
761         ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
762         ok &= vmcs_write64(GUEST_SS_BASE, 0);
763         ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
764         ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);
765
766         ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
767         ok &= vmcs_write64(GUEST_TR_BASE, 0);
768         ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
769         ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);
770
771         ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
772         ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
773         ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
774         ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
775
776         ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
777         ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
778         ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
779         ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
780
781         ok &= vmcs_write64(GUEST_IA32_EFER, 0);
782
783         ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
784         ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
785         ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);
786
787         ok &= vmcs_write64(GUEST_DR7, 0x00000400);
788         ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
789
790         ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
791         ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
792         ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
793
794         val = vmcs_read32(VM_ENTRY_CONTROLS);
795         val &= ~VM_ENTRY_IA32E_MODE;
796         ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
797
798         ok &= vmx_set_cell_config(cpu_data->cell);
799
800         if (!ok) {
801                 panic_printk("FATAL: CPU reset failed\n");
802                 panic_stop(cpu_data);
803         }
804 }
805
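/*
 * Force a VM exit on the next VM entry of this CPU: the VMX preemption
 * timer value is kept at zero (see vmcs_setup()), so merely enabling the
 * timer control makes the guest trap immediately, giving the exit handler
 * a chance to process pending events.
 */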
806 void vmx_schedule_vmexit(struct per_cpu *cpu_data)
807 {
808         u32 pin_based_ctrl;
809
810         if (cpu_data->vmx_state != VMCS_READY)
811                 return;
812
813         pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
814         pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
815         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
816 }
817
818 void vmx_cpu_park(struct per_cpu *cpu_data)
819 {
820         vmx_cpu_reset(cpu_data, 0);
821         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
822 }
823
824 static void vmx_disable_preemption_timer(void)
825 {
826         u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
827
828         pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
829         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
830 }
831
832 static void vmx_skip_emulated_instruction(unsigned int inst_len)
833 {
834         vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
835 }
836
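/*
 * Emulate the EFER.LMA transition: when the guest enables paging with
 * EFER.LME already set, hardware would normally activate long mode. Since
 * the CR0 write was intercepted, LMA and the "IA-32e mode guest" entry
 * control have to be updated by hand.
 */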
837 static void update_efer(void)
838 {
839         unsigned long efer = vmcs_read64(GUEST_IA32_EFER);
840
841         if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
842                 return;
843
844         efer |= EFER_LMA;
845         vmcs_write64(GUEST_IA32_EFER, efer);
846         vmcs_write32(VM_ENTRY_CONTROLS,
847                      vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
848 }
849
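/*
 * Hypercall convention: the code is passed in RAX, up to two arguments in
 * RDI and RSI, and the result is returned in RAX. Calls are only accepted
 * from ring 0 and not from virtual-8086 mode.
 */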
850 static void vmx_handle_hypercall(struct registers *guest_regs,
851                                  struct per_cpu *cpu_data)
852 {
853         unsigned long code = guest_regs->rax;
854
855         vmx_skip_emulated_instruction(X86_INST_LEN_VMCALL);
856
857         if ((!(vmcs_read64(GUEST_IA32_EFER) & EFER_LMA) &&
858              vmcs_read64(GUEST_RFLAGS) & X86_RFLAGS_VM) ||
859             (vmcs_read16(GUEST_CS_SELECTOR) & 3) != 0) {
860                 guest_regs->rax = -EPERM;
861                 return;
862         }
863
864         guest_regs->rax = hypercall(cpu_data, code, guest_regs->rdi,
865                                     guest_regs->rsi);
866         if (guest_regs->rax == -ENOSYS)
867                 printk("CPU %d: Unknown vmcall %d, RIP: %p\n",
868                        cpu_data->cpu_id, code,
869                        vmcs_read64(GUEST_RIP) - X86_INST_LEN_VMCALL);
870
871         if (code == JAILHOUSE_HC_DISABLE && guest_regs->rax == 0)
872                 vmx_cpu_deactivate_vmm(guest_regs, cpu_data);
873 }
874
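/*
 * Decode the CR-access exit qualification (SDM-defined layout): bits 3:0
 * hold the control register number, bits 5:4 the access type (only
 * "MOV to CR" is handled) and bits 11:8 the source general-purpose
 * register. The guest_regs frame is indexed in reverse (15 - reg); RSP
 * has to be taken from the VMCS instead.
 */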
875 static bool vmx_handle_cr(struct registers *guest_regs,
876                           struct per_cpu *cpu_data)
877 {
878         u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
879         unsigned long cr, reg, val;
880
881         cr = exit_qualification & 0xf;
882         reg = (exit_qualification >> 8) & 0xf;
883
884         switch ((exit_qualification >> 4) & 3) {
885         case 0: /* move to cr */
886                 if (reg == 4)
887                         val = vmcs_read64(GUEST_RSP);
888                 else
889                         val = ((unsigned long *)guest_regs)[15 - reg];
890
891                 if (cr == 0 || cr == 4) {
892                         vmx_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
893                         /* TODO: check for #GP reasons */
894                         vmx_set_guest_cr(cr, val);
895                         if (cr == 0 && val & X86_CR0_PG)
896                                 update_efer();
897                         return true;
898                 }
899                 break;
900         default:
901                 break;
902         }
903         panic_printk("FATAL: Unhandled CR access, qualification %x\n",
904                      exit_qualification);
905         return false;
906 }
907
908 static bool
909 vmx_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
910 {
911         if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
912                 pg_structs->root_paging = x86_64_paging;
913                 pg_structs->root_table_gphys =
914                         vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
915         } else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
916                  !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
917                 pg_structs->root_paging = i386_paging;
918                 pg_structs->root_table_gphys =
919                         vmcs_read64(GUEST_CR3) & 0xfffff000UL;
920         } else {
921                 printk("FATAL: Unsupported paging mode\n");
922                 return false;
923         }
924         return true;
925 }
926
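/*
 * Handle a VM exit caused by a guest access to the virtualized xAPIC
 * page: bits 11:0 of the exit qualification give the offset into the
 * page. Only 16-byte aligned linear reads and writes are forwarded;
 * offset >> 4 is the register index expected by apic_mmio_access().
 */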
927 static bool vmx_handle_apic_access(struct registers *guest_regs,
928                                    struct per_cpu *cpu_data)
929 {
930         struct guest_paging_structures pg_structs;
931         unsigned int inst_len, offset;
932         u64 qualification;
933         bool is_write;
934
935         qualification = vmcs_read64(EXIT_QUALIFICATION);
936
937         switch (qualification & APIC_ACCESS_TYPE_MASK) {
938         case APIC_ACCESS_TYPE_LINEAR_READ:
939         case APIC_ACCESS_TYPE_LINEAR_WRITE:
940                 is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
941                 offset = qualification & APIC_ACCESS_OFFSET_MASK;
942                 if (offset & 0x00f)
943                         break;
944
945                 if (!vmx_get_guest_paging_structs(&pg_structs))
946                         break;
947
948                 inst_len = apic_mmio_access(guest_regs, cpu_data,
949                                             vmcs_read64(GUEST_RIP),
950                                             &pg_structs, offset >> 4,
951                                             is_write);
952                 if (!inst_len)
953                         break;
954
955                 vmx_skip_emulated_instruction(inst_len);
956                 return true;
957         }
958         panic_printk("FATAL: Unhandled APIC access, "
959                      "qualification %x\n", qualification);
960         return false;
961 }
962
963 static void dump_vm_exit_details(u32 reason)
964 {
965         panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
966         panic_printk("vectoring info: %x interrupt info: %x\n",
967                      vmcs_read32(IDT_VECTORING_INFO_FIELD),
968                      vmcs_read32(VM_EXIT_INTR_INFO));
969         if (reason == EXIT_REASON_EPT_VIOLATION ||
970             reason == EXIT_REASON_EPT_MISCONFIG)
971                 panic_printk("guest phys addr %p guest linear addr: %p\n",
972                              vmcs_read64(GUEST_PHYSICAL_ADDRESS),
973                              vmcs_read64(GUEST_LINEAR_ADDRESS));
974 }
975
976 static void dump_guest_regs(struct registers *guest_regs)
977 {
978         panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
979                      vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
980         panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
981                      guest_regs->rbx, guest_regs->rcx);
982         panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
983                      guest_regs->rsi, guest_regs->rdi);
984         panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
985                      vmcs_read64(GUEST_CS_SELECTOR),
986                      vmcs_read64(GUEST_CS_BASE),
987                      vmcs_read32(GUEST_CS_AR_BYTES),
988                      !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
989         panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
990                      vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
991         panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
992 }
993
994 static bool vmx_handle_io_access(struct registers *guest_regs,
995                                  struct per_cpu *cpu_data)
996 {
997         /* parse exit qualification for I/O instructions (see SDM, 27.2.1) */
998         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
999         u16 port = (exitq >> 16) & 0xFFFF;
1000         bool dir_in = (exitq & 0x8) >> 3;
1001         unsigned int size = (exitq & 0x3) + 1;
1002
1003         /* string and REP-prefixed instructions are not supported */
1004         if (exitq & 0x30)
1005                 goto invalid_access;
1006
1007         if (x86_pci_config_handler(guest_regs, cpu_data->cell, port, dir_in,
1008                                    size) == 1) {
1009                 vmx_skip_emulated_instruction(
1010                                 vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
1011                 return true;
1012         }
1013
1014 invalid_access:
1015         panic_printk("FATAL: Invalid PIO %s, port: %x size: %d\n",
1016                      dir_in ? "read" : "write", port, size);
1017         panic_printk("PCI address port: %x\n",
1018                      cpu_data->cell->pci_addr_port_val);
1019         return false;
1020 }
1021
1022 static bool vmx_handle_ept_violation(struct registers *guest_regs,
1023                                      struct per_cpu *cpu_data)
1024 {
1025         u64 phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1026         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1027         struct guest_paging_structures pg_structs;
1028         struct mmio_access access;
1029         int result = 0;
1030         bool is_write;
1031         u32 val;
1032
1033         /* We don't enable dirty/accessed bit updates in EPTP, so only read
1034          * or write flags can be set, not both. */
1035         is_write = !!(exitq & 0x2);
1036
1037         if (!vmx_get_guest_paging_structs(&pg_structs))
1038                 goto invalid_access;
1039
1040         access = mmio_parse(cpu_data, vmcs_read64(GUEST_RIP),
1041                             &pg_structs, is_write);
1042         if (!access.inst_len || access.size != 4)
1043                 goto invalid_access;
1044
1045         if (is_write)
1046                 val = ((unsigned long *)guest_regs)[access.reg];
1047
1048         result = ioapic_access_handler(cpu_data->cell, is_write, phys_addr,
1049                                        &val);
1050         if (result == 0)
1051                 result = pci_mmio_access_handler(cpu_data->cell, is_write,
1052                                                  phys_addr, &val);
1053
1054         if (result == 1) {
1055                 if (!is_write)
1056                         ((unsigned long *)guest_regs)[access.reg] = val;
1057                 vmx_skip_emulated_instruction(
1058                                 vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
1059                 return true;
1060         }
1061
1062 invalid_access:
1063         /* report only unhandled access failures */
1064         if (result == 0)
1065                 panic_printk("FATAL: Invalid MMIO/RAM %s, addr: %p\n",
1066                              is_write ? "write" : "read", phys_addr);
1067         return false;
1068 }
1069
1070 void vmx_handle_exit(struct registers *guest_regs, struct per_cpu *cpu_data)
1071 {
1072         u32 reason = vmcs_read32(VM_EXIT_REASON);
1073         int sipi_vector;
1074
1075         cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;
1076
1077         switch (reason) {
1078         case EXIT_REASON_EXCEPTION_NMI:
1079                 asm volatile("int %0" : : "i" (NMI_VECTOR));
1080                 /* fall through */
1081         case EXIT_REASON_PREEMPTION_TIMER:
1082                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
1083                 vmx_disable_preemption_timer();
1084                 sipi_vector = x86_handle_events(cpu_data);
1085                 if (sipi_vector >= 0) {
1086                         printk("CPU %d received SIPI, vector %x\n",
1087                                cpu_data->cpu_id, sipi_vector);
1088                         vmx_cpu_reset(cpu_data, sipi_vector);
1089                         memset(guest_regs, 0, sizeof(*guest_regs));
1090                 }
1091                 vtd_check_pending_faults(cpu_data);
1092                 return;
1093         case EXIT_REASON_CPUID:
1094                 vmx_skip_emulated_instruction(X86_INST_LEN_CPUID);
1095                 guest_regs->rax &= 0xffffffff;
1096                 guest_regs->rbx &= 0xffffffff;
1097                 guest_regs->rcx &= 0xffffffff;
1098                 guest_regs->rdx &= 0xffffffff;
1099                 __cpuid((u32 *)&guest_regs->rax, (u32 *)&guest_regs->rbx,
1100                         (u32 *)&guest_regs->rcx, (u32 *)&guest_regs->rdx);
1101                 return;
1102         case EXIT_REASON_VMCALL:
1103                 vmx_handle_hypercall(guest_regs, cpu_data);
1104                 return;
1105         case EXIT_REASON_CR_ACCESS:
1106                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
1107                 if (vmx_handle_cr(guest_regs, cpu_data))
1108                         return;
1109                 break;
1110         case EXIT_REASON_MSR_READ:
1111                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
1112                 if (guest_regs->rcx >= MSR_X2APIC_BASE &&
1113                     guest_regs->rcx <= MSR_X2APIC_END) {
1114                         vmx_skip_emulated_instruction(X86_INST_LEN_RDMSR);
1115                         x2apic_handle_read(guest_regs);
1116                         return;
1117                 }
1118                 panic_printk("FATAL: Unhandled MSR read: %08x\n",
1119                              guest_regs->rcx);
1120                 break;
1121         case EXIT_REASON_MSR_WRITE:
1122                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
1123                 if (guest_regs->rcx == MSR_X2APIC_ICR) {
1124                         if (!apic_handle_icr_write(cpu_data, guest_regs->rax,
1125                                                    guest_regs->rdx))
1126                                 break;
1127                         vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
1128                         return;
1129                 }
1130                 if (guest_regs->rcx >= MSR_X2APIC_BASE &&
1131                     guest_regs->rcx <= MSR_X2APIC_END) {
1132                         x2apic_handle_write(guest_regs);
1133                         vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
1134                         return;
1135                 }
1136                 panic_printk("FATAL: Unhandled MSR write: %08x\n",
1137                              guest_regs->rcx);
1138                 break;
1139         case EXIT_REASON_APIC_ACCESS:
1140                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
1141                 if (vmx_handle_apic_access(guest_regs, cpu_data))
1142                         return;
1143                 break;
1144         case EXIT_REASON_XSETBV:
1145                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XSETBV]++;
1146                 if (guest_regs->rax & X86_XCR0_FP &&
1147                     (guest_regs->rax & ~cpuid_eax(0x0d)) == 0 &&
1148                     guest_regs->rcx == 0 && guest_regs->rdx == 0) {
1149                         vmx_skip_emulated_instruction(X86_INST_LEN_XSETBV);
1150                         asm volatile(
1151                                 "xsetbv"
1152                                 : /* no output */
1153                                 : "a" (guest_regs->rax), "c" (0), "d" (0));
1154                         return;
1155                 }
1156                 panic_printk("FATAL: Invalid xsetbv parameters: "
1157                              "xcr[%d] = %08x:%08x\n", guest_regs->rcx,
1158                              guest_regs->rdx, guest_regs->rax);
1159                 break;
1160         case EXIT_REASON_IO_INSTRUCTION:
1161                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
1162                 if (vmx_handle_io_access(guest_regs, cpu_data))
1163                         return;
1164                 break;
1165         case EXIT_REASON_EPT_VIOLATION:
1166                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
1167                 if (vmx_handle_ept_violation(guest_regs, cpu_data))
1168                         return;
1169                 break;
1170         default:
1171                 panic_printk("FATAL: %s, reason %d\n",
1172                              (reason & EXIT_REASONS_FAILED_VMENTRY) ?
1173                              "VM-Entry failure" : "Unhandled VM-Exit",
1174                              (u16)reason);
1175                 dump_vm_exit_details(reason);
1176                 break;
1177         }
1178         dump_guest_regs(guest_regs);
1179         panic_halt(cpu_data);
1180 }
1181
1182 void vmx_entry_failure(struct per_cpu *cpu_data)
1183 {
1184         panic_printk("FATAL: vmresume failed, error %d\n",
1185                      vmcs_read32(VM_INSTRUCTION_ERROR));
1186         panic_stop(cpu_data);
1187 }