1 /*
2  * Jailhouse, a Linux-based partitioning hypervisor
3  *
4  * Copyright (c) Siemens AG, 2013
5  *
6  * Authors:
7  *  Jan Kiszka <jan.kiszka@siemens.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2.  See
10  * the COPYING file in the top-level directory.
11  */
12
13 #include <jailhouse/entry.h>
14 #include <jailhouse/paging.h>
15 #include <jailhouse/processor.h>
16 #include <jailhouse/printk.h>
17 #include <jailhouse/string.h>
18 #include <jailhouse/control.h>
19 #include <jailhouse/hypercall.h>
20 #include <jailhouse/mmio.h>
21 #include <jailhouse/pci.h>
22 #include <asm/apic.h>
23 #include <asm/control.h>
24 #include <asm/io.h>
25 #include <asm/ioapic.h>
26 #include <asm/pci.h>
27 #include <asm/vmx.h>
28 #include <asm/vtd.h>
29
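/* segment marked unusable via bit 16 of the VMX access rights field */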
30 static const struct segment invalid_seg = {
31         .access_rights = 0x10000
32 };
33
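/*
 * VMX MSR access bitmaps: one 1-Kbyte bitmap per combination of read/write
 * and low (0x00000000-0x00001fff) / high (0xc0000000-0xc0001fff) MSR range.
 * A set bit triggers a VM exit on the corresponding RDMSR/WRMSR; by default
 * only selected x2APIC MSRs (0x802-0x83f) are intercepted.
 */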
34 static u8 __attribute__((aligned(PAGE_SIZE))) msr_bitmap[][0x2000/8] = {
35         [ VMX_MSR_BMP_0000_READ ] = {
36                 [      0/8 ...  0x7ff/8 ] = 0,
37                 [  0x800/8 ...  0x807/8 ] = 0x0c, /* 0x802, 0x803 */
38                 [  0x808/8 ...  0x80f/8 ] = 0xa5, /* 0x808, 0x80a, 0x80d, 0x80f */
39                 [  0x810/8 ...  0x817/8 ] = 0xff, /* 0x810 - 0x817 */
40                 [  0x818/8 ...  0x81f/8 ] = 0xff, /* 0x818 - 0x81f */
41                 [  0x820/8 ...  0x827/8 ] = 0xff, /* 0x820 - 0x827 */
42                 [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
43                 [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
44                 [  0x838/8 ...  0x83f/8 ] = 0x43, /* 0x838, 0x839, 0x83e */
45                 [  0x840/8 ... 0x1fff/8 ] = 0,
46         },
47         [ VMX_MSR_BMP_C000_READ ] = {
48                 [      0/8 ... 0x1fff/8 ] = 0,
49         },
50         [ VMX_MSR_BMP_0000_WRITE ] = {
51                 [      0/8 ...  0x807/8 ] = 0,
52                 [  0x808/8 ...  0x80f/8 ] = 0x89, /* 0x808, 0x80b, 0x80f */
53                 [  0x810/8 ...  0x827/8 ] = 0,
54                 [  0x828/8 ...  0x82f/8 ] = 0x81, /* 0x828, 0x82f */
55                 [  0x830/8 ...  0x837/8 ] = 0xfd, /* 0x830, 0x832 - 0x837 */
56                 [  0x838/8 ...  0x83f/8 ] = 0xc1, /* 0x838, 0x83e, 0x83f */
57                 [  0x840/8 ... 0x1fff/8 ] = 0,
58         },
59         [ VMX_MSR_BMP_C000_WRITE ] = {
60                 [      0/8 ... 0x1fff/8 ] = 0,
61         },
62 };
63 static u8 __attribute__((aligned(PAGE_SIZE))) apic_access_page[PAGE_SIZE];
64 static struct paging ept_paging[EPT_PAGE_DIR_LEVELS];
65 static u32 enable_rdtscp;
66
67 static bool vmxon(struct per_cpu *cpu_data)
68 {
69         unsigned long vmxon_addr;
70         u8 ok;
71
72         vmxon_addr = page_map_hvirt2phys(&cpu_data->vmxon_region);
73         asm volatile(
74                 "vmxon (%1)\n\t"
75                 "seta %0"
76                 : "=rm" (ok)
77                 : "r" (&vmxon_addr), "m" (vmxon_addr)
78                 : "memory", "cc");
79         return ok;
80 }
81
82 static bool vmcs_clear(struct per_cpu *cpu_data)
83 {
84         unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
85         u8 ok;
86
87         asm volatile(
88                 "vmclear (%1)\n\t"
89                 "seta %0"
90                 : "=qm" (ok)
91                 : "r" (&vmcs_addr), "m" (vmcs_addr)
92                 : "memory", "cc");
93         return ok;
94 }
95
96 static bool vmcs_load(struct per_cpu *cpu_data)
97 {
98         unsigned long vmcs_addr = page_map_hvirt2phys(&cpu_data->vmcs);
99         u8 ok;
100
101         asm volatile(
102                 "vmptrld (%1)\n\t"
103                 "seta %0"
104                 : "=qm" (ok)
105                 : "r" (&vmcs_addr), "m" (vmcs_addr)
106                 : "memory", "cc");
107         return ok;
108 }
109
110 static inline unsigned long vmcs_read64(unsigned long field)
111 {
112         unsigned long value;
113
114         asm volatile("vmread %1,%0" : "=r" (value) : "r" (field) : "cc");
115         return value;
116 }
117
118 static inline u16 vmcs_read16(unsigned long field)
119 {
120         return vmcs_read64(field);
121 }
122
123 static inline u32 vmcs_read32(unsigned long field)
124 {
125         return vmcs_read64(field);
126 }
127
128 static bool vmcs_write64(unsigned long field, unsigned long val)
129 {
130         u8 ok;
131
132         asm volatile(
133                 "vmwrite %1,%2\n\t"
134                 "setnz %0"
135                 : "=qm" (ok)
136                 : "r" (val), "r" (field)
137                 : "cc");
138         if (!ok)
139                 printk("FATAL: vmwrite %08lx failed, error %d, caller %p\n",
140                        field, vmcs_read32(VM_INSTRUCTION_ERROR),
141                        __builtin_return_address(0));
142         return ok;
143 }
144
145 static bool vmcs_write16(unsigned long field, u16 value)
146 {
147         return vmcs_write64(field, value);
148 }
149
150 static bool vmcs_write32(unsigned long field, u32 value)
151 {
152         return vmcs_write64(field, value);
153 }
154
155 static int vmx_check_features(void)
156 {
157         unsigned long vmx_proc_ctrl, vmx_proc_ctrl2, ept_cap;
158         unsigned long vmx_pin_ctrl, vmx_basic;
159
160         if (!(cpuid_ecx(1) & X86_FEATURE_VMX))
161                 return -ENODEV;
162
163         vmx_basic = read_msr(MSR_IA32_VMX_BASIC);
164
165         /* require VMCS size <= PAGE_SIZE,
166          * VMCS memory access type == write back and
167          * availability of TRUE_*_CTLS */
168         if (((vmx_basic >> 32) & 0x1fff) > PAGE_SIZE ||
169             ((vmx_basic >> 50) & 0xf) != EPT_TYPE_WRITEBACK ||
170             !(vmx_basic & (1UL << 55)))
171                 return -EIO;
172
173         /* require NMI exiting and preemption timer support */
174         vmx_pin_ctrl = read_msr(MSR_IA32_VMX_PINBASED_CTLS) >> 32;
175         if (!(vmx_pin_ctrl & PIN_BASED_NMI_EXITING) ||
176             !(vmx_pin_ctrl & PIN_BASED_VMX_PREEMPTION_TIMER))
177                 return -EIO;
178
179         /* require I/O and MSR bitmap as well as secondary controls support */
180         vmx_proc_ctrl = read_msr(MSR_IA32_VMX_PROCBASED_CTLS) >> 32;
181         if (!(vmx_proc_ctrl & CPU_BASED_USE_IO_BITMAPS) ||
182             !(vmx_proc_ctrl & CPU_BASED_USE_MSR_BITMAPS) ||
183             !(vmx_proc_ctrl & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
184                 return -EIO;
185
186         /* require disabling of CR3 access interception */
187         vmx_proc_ctrl = read_msr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS);
188         if (vmx_proc_ctrl &
189             (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING))
190                 return -EIO;
191
192         /* require APIC access, EPT and unrestricted guest mode support */
193         vmx_proc_ctrl2 = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2) >> 32;
194         ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
195         if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) ||
196             !(vmx_proc_ctrl2 & SECONDARY_EXEC_ENABLE_EPT) ||
197             (ept_cap & EPT_MANDATORY_FEATURES) != EPT_MANDATORY_FEATURES ||
198             !(ept_cap & (EPT_INVEPT_SINGLE | EPT_INVEPT_GLOBAL)) ||
199             !(vmx_proc_ctrl2 & SECONDARY_EXEC_UNRESTRICTED_GUEST))
200                 return -EIO;
201
202         /* require RDTSCP if present in CPUID */
203         if (cpuid_edx(0x80000001) & X86_FEATURE_RDTSCP) {
204                 enable_rdtscp = SECONDARY_EXEC_RDTSCP;
205                 if (!(vmx_proc_ctrl2 & SECONDARY_EXEC_RDTSCP))
206                         return -EIO;
207         }
208
209         /* require activity state HLT */
210         if (!(read_msr(MSR_IA32_VMX_MISC) & VMX_MISC_ACTIVITY_HLT))
211                 return -EIO;
212
213         return 0;
214 }
215
216 static void ept_set_next_pt(pt_entry_t pte, unsigned long next_pt)
217 {
218         *pte = (next_pt & 0x000ffffffffff000UL) | EPT_FLAG_READ |
219                 EPT_FLAG_WRITE | EPT_FLAG_EXECUTE;
220 }
221
222 int vmx_init(void)
223 {
224         unsigned int n;
225         int err;
226
227         err = vmx_check_features();
228         if (err)
229                 return err;
230
231         /* derive ept_paging from very similar x86_64_paging */
232         memcpy(ept_paging, x86_64_paging, sizeof(ept_paging));
233         for (n = 0; n < EPT_PAGE_DIR_LEVELS; n++)
234                 ept_paging[n].set_next_pt = ept_set_next_pt;
235         if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_1G_PAGES))
236                 ept_paging[1].page_size = 0;
237         if (!(read_msr(MSR_IA32_VMX_EPT_VPID_CAP) & EPT_2M_PAGES))
238                 ept_paging[2].page_size = 0;
239
240         if (using_x2apic) {
241                 /* allow direct x2APIC access except for ICR writes */
242                 memset(&msr_bitmap[VMX_MSR_BMP_0000_READ][MSR_X2APIC_BASE/8],
243                        0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
244                 memset(&msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_BASE/8],
245                        0, (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/8);
246                 msr_bitmap[VMX_MSR_BMP_0000_WRITE][MSR_X2APIC_ICR/8] = 0x01;
247         }
248
249         return vmx_cell_init(&root_cell);
250 }
251
252 unsigned long arch_page_map_gphys2phys(struct per_cpu *cpu_data,
253                                        unsigned long gphys)
254 {
255         return page_map_virt2phys(&cpu_data->cell->vmx.ept_structs, gphys);
256 }
257
258 int vmx_cell_init(struct cell *cell)
259 {
260         const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
261         u32 pio_bitmap_size = cell->config->pio_bitmap_size;
262         unsigned int n, pm_timer_addr;
263         u32 size;
264         int err;
265         u8 *b;
266
267         /* PM timer has to be provided */
268         if (system_config->platform_info.x86.pm_timer_address == 0)
269                 return -EINVAL;
270
271         /* build root EPT of cell */
272         cell->vmx.ept_structs.root_paging = ept_paging;
273         cell->vmx.ept_structs.root_table = page_alloc(&mem_pool, 1);
274         if (!cell->vmx.ept_structs.root_table)
275                 return -ENOMEM;
276
277         err = page_map_create(&cell->vmx.ept_structs,
278                               page_map_hvirt2phys(apic_access_page),
279                               PAGE_SIZE, XAPIC_BASE,
280                               EPT_FLAG_READ|EPT_FLAG_WRITE|EPT_FLAG_WB_TYPE,
281                               PAGE_MAP_NON_COHERENT);
282         if (err) {
283                 vmx_cell_exit(cell);
284                 return err;
285         }
286
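        /*
         * Each I/O bitmap page covers 32K ports (IO_BITMAP_A: 0x0000-0x7fff,
         * IO_BITMAP_B: 0x8000-0xffff). Start with all ports intercepted,
         * then overlay the cell's configured PIO bitmap.
         */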
287         memset(cell->vmx.io_bitmap, -1, sizeof(cell->vmx.io_bitmap));
288
289         for (n = 0; n < 2; n++) {
290                 size = pio_bitmap_size <= PAGE_SIZE ?
291                         pio_bitmap_size : PAGE_SIZE;
292                 memcpy(cell->vmx.io_bitmap + n * PAGE_SIZE, pio_bitmap, size);
293                 pio_bitmap += size;
294                 pio_bitmap_size -= size;
295         }
296
297         if (cell != &root_cell) {
298                 /*
299                  * Shrink the root cell's PIO access according to the new
300                  * cell's access rights.
301                  */
302                 pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
303                 pio_bitmap_size = cell->config->pio_bitmap_size;
304                 for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
305                      b++, pio_bitmap++, pio_bitmap_size--)
306                         *b |= ~*pio_bitmap;
307         }
308
309         /* permit access to the PM timer */
310         pm_timer_addr = system_config->platform_info.x86.pm_timer_address;
311         for (n = 0; n < 4; n++, pm_timer_addr++) {
312                 b = cell->vmx.io_bitmap;
313                 b[pm_timer_addr / 8] &= ~(1 << (pm_timer_addr % 8));
314         }
315
316         return 0;
317 }
318
319 int vmx_map_memory_region(struct cell *cell,
320                           const struct jailhouse_memory *mem)
321 {
322         u64 phys_start = mem->phys_start;
323         u32 flags = EPT_FLAG_WB_TYPE;
324
325         if (mem->flags & JAILHOUSE_MEM_READ)
326                 flags |= EPT_FLAG_READ;
327         if (mem->flags & JAILHOUSE_MEM_WRITE)
328                 flags |= EPT_FLAG_WRITE;
329         if (mem->flags & JAILHOUSE_MEM_EXECUTE)
330                 flags |= EPT_FLAG_EXECUTE;
331         if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
332                 phys_start = page_map_hvirt2phys(&cell->comm_page);
333
334         return page_map_create(&cell->vmx.ept_structs, phys_start, mem->size,
335                                mem->virt_start, flags, PAGE_MAP_NON_COHERENT);
336 }
337
338 int vmx_unmap_memory_region(struct cell *cell,
339                             const struct jailhouse_memory *mem)
340 {
341         return page_map_destroy(&cell->vmx.ept_structs, mem->virt_start,
342                                 mem->size, PAGE_MAP_NON_COHERENT);
343 }
344
345 void vmx_cell_exit(struct cell *cell)
346 {
347         const u8 *root_pio_bitmap =
348                 jailhouse_cell_pio_bitmap(root_cell.config);
349         const u8 *pio_bitmap = jailhouse_cell_pio_bitmap(cell->config);
350         u32 pio_bitmap_size = cell->config->pio_bitmap_size;
351         u8 *b;
352
353         page_map_destroy(&cell->vmx.ept_structs, XAPIC_BASE, PAGE_SIZE,
354                          PAGE_MAP_NON_COHERENT);
355
356         if (root_cell.config->pio_bitmap_size < pio_bitmap_size)
357                 pio_bitmap_size = root_cell.config->pio_bitmap_size;
358
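        /*
         * Hand the ports of the vanishing cell back to the root cell, but
         * only where the root cell's own PIO bitmap permits access.
         */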
359         for (b = root_cell.vmx.io_bitmap; pio_bitmap_size > 0;
360              b++, pio_bitmap++, root_pio_bitmap++, pio_bitmap_size--)
361                 *b &= *pio_bitmap | *root_pio_bitmap;
362
363         page_free(&mem_pool, cell->vmx.ept_structs.root_table, 1);
364 }
365
366 void vmx_invept(void)
367 {
368         unsigned long ept_cap = read_msr(MSR_IA32_VMX_EPT_VPID_CAP);
369         struct {
370                 u64 eptp;
371                 u64 reserved;
372         } descriptor;
373         u64 type;
374         u8 ok;
375
376         descriptor.reserved = 0;
377         if (ept_cap & EPT_INVEPT_SINGLE) {
378                 type = VMX_INVEPT_SINGLE;
379                 descriptor.eptp = vmcs_read64(EPT_POINTER);
380         } else {
381                 type = VMX_INVEPT_GLOBAL;
382                 descriptor.eptp = 0;
383         }
384         asm volatile(
385                 "invept (%1),%2\n\t"
386                 "seta %0\n\t"
387                 : "=qm" (ok)
388                 : "r" (&descriptor), "r" (type)
389                 : "memory", "cc");
390
391         if (!ok) {
392                 panic_printk("FATAL: invept failed, error %d\n",
393                              vmcs_read32(VM_INSTRUCTION_ERROR));
394                 panic_stop(NULL);
395         }
396 }
397
398 static bool vmx_set_guest_cr(int cr, unsigned long val)
399 {
400         unsigned long fixed0, fixed1, required1;
401         bool ok = true;
402
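        /*
         * CR bits set in the FIXED0 MSR must be 1 and bits clear in the
         * FIXED1 MSR must be 0 while in VMX operation; required1 collects
         * the bits that have to be forced to 1.
         */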
403         fixed0 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED0
404                              : MSR_IA32_VMX_CR0_FIXED0);
405         fixed1 = read_msr(cr ? MSR_IA32_VMX_CR4_FIXED1
406                              : MSR_IA32_VMX_CR0_FIXED1);
407         required1 = fixed0 & fixed1;
408         if (cr == 0) {
409                 fixed1 &= ~(X86_CR0_NW | X86_CR0_CD);
410                 required1 &= ~(X86_CR0_PE | X86_CR0_PG);
411                 required1 |= X86_CR0_ET;
412         } else {
413                 /* keeps the hypervisor visible */
414                 val |= X86_CR4_VMXE;
415         }
416         ok &= vmcs_write64(cr ? GUEST_CR4 : GUEST_CR0,
417                            (val & fixed1) | required1);
418         ok &= vmcs_write64(cr ? CR4_READ_SHADOW : CR0_READ_SHADOW, val);
419         ok &= vmcs_write64(cr ? CR4_GUEST_HOST_MASK : CR0_GUEST_HOST_MASK,
420                            required1 | ~fixed1);
421
422         return ok;
423 }
424
425 static bool vmx_set_cell_config(struct cell *cell)
426 {
427         u8 *io_bitmap;
428         bool ok = true;
429
430         io_bitmap = cell->vmx.io_bitmap;
431         ok &= vmcs_write64(IO_BITMAP_A, page_map_hvirt2phys(io_bitmap));
432         ok &= vmcs_write64(IO_BITMAP_B,
433                            page_map_hvirt2phys(io_bitmap + PAGE_SIZE));
434
435         ok &= vmcs_write64(EPT_POINTER,
436                         page_map_hvirt2phys(cell->vmx.ept_structs.root_table) |
437                         EPT_TYPE_WRITEBACK | EPT_PAGE_WALK_LEN);
438
439         return ok;
440 }
441
442 static bool vmx_set_guest_segment(const struct segment *seg,
443                                   unsigned long selector_field)
444 {
445         bool ok = true;
446
447         ok &= vmcs_write16(selector_field, seg->selector);
448         ok &= vmcs_write64(selector_field + GUEST_SEG_BASE, seg->base);
449         ok &= vmcs_write32(selector_field + GUEST_SEG_LIMIT, seg->limit);
450         ok &= vmcs_write32(selector_field + GUEST_SEG_AR_BYTES,
451                            seg->access_rights);
452         return ok;
453 }
454
455 static bool vmcs_setup(struct per_cpu *cpu_data)
456 {
457         struct desc_table_reg dtr;
458         unsigned long val;
459         bool ok = true;
460
461         ok &= vmcs_write64(HOST_CR0, read_cr0());
462         ok &= vmcs_write64(HOST_CR3, read_cr3());
463         ok &= vmcs_write64(HOST_CR4, read_cr4());
464
465         ok &= vmcs_write16(HOST_CS_SELECTOR, GDT_DESC_CODE * 8);
466         ok &= vmcs_write16(HOST_DS_SELECTOR, 0);
467         ok &= vmcs_write16(HOST_ES_SELECTOR, 0);
468         ok &= vmcs_write16(HOST_SS_SELECTOR, 0);
469         ok &= vmcs_write16(HOST_FS_SELECTOR, 0);
470         ok &= vmcs_write16(HOST_GS_SELECTOR, 0);
471         ok &= vmcs_write16(HOST_TR_SELECTOR, GDT_DESC_TSS * 8);
472
473         ok &= vmcs_write64(HOST_FS_BASE, 0);
474         ok &= vmcs_write64(HOST_GS_BASE, 0);
475         ok &= vmcs_write64(HOST_TR_BASE, 0);
476
477         read_gdtr(&dtr);
478         ok &= vmcs_write64(HOST_GDTR_BASE, dtr.base);
479         read_idtr(&dtr);
480         ok &= vmcs_write64(HOST_IDTR_BASE, dtr.base);
481
482         ok &= vmcs_write64(HOST_IA32_EFER, EFER_LMA | EFER_LME);
483
484         ok &= vmcs_write32(HOST_IA32_SYSENTER_CS, 0);
485         ok &= vmcs_write64(HOST_IA32_SYSENTER_EIP, 0);
486         ok &= vmcs_write64(HOST_IA32_SYSENTER_ESP, 0);
487
488         ok &= vmcs_write64(HOST_RSP, (unsigned long)cpu_data->stack +
489                            sizeof(cpu_data->stack));
490         ok &= vmcs_write64(HOST_RIP, (unsigned long)vm_exit);
491
492         ok &= vmx_set_guest_cr(0, read_cr0());
493         ok &= vmx_set_guest_cr(4, read_cr4());
494
495         ok &= vmcs_write64(GUEST_CR3, cpu_data->linux_cr3);
496
497         ok &= vmx_set_guest_segment(&cpu_data->linux_cs, GUEST_CS_SELECTOR);
498         ok &= vmx_set_guest_segment(&cpu_data->linux_ds, GUEST_DS_SELECTOR);
499         ok &= vmx_set_guest_segment(&cpu_data->linux_es, GUEST_ES_SELECTOR);
500         ok &= vmx_set_guest_segment(&cpu_data->linux_fs, GUEST_FS_SELECTOR);
501         ok &= vmx_set_guest_segment(&cpu_data->linux_gs, GUEST_GS_SELECTOR);
502         ok &= vmx_set_guest_segment(&invalid_seg, GUEST_SS_SELECTOR);
503         ok &= vmx_set_guest_segment(&cpu_data->linux_tss, GUEST_TR_SELECTOR);
504         ok &= vmx_set_guest_segment(&invalid_seg, GUEST_LDTR_SELECTOR);
505
506         ok &= vmcs_write64(GUEST_GDTR_BASE, cpu_data->linux_gdtr.base);
507         ok &= vmcs_write32(GUEST_GDTR_LIMIT, cpu_data->linux_gdtr.limit);
508         ok &= vmcs_write64(GUEST_IDTR_BASE, cpu_data->linux_idtr.base);
509         ok &= vmcs_write32(GUEST_IDTR_LIMIT, cpu_data->linux_idtr.limit);
510
511         ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
512         ok &= vmcs_write64(GUEST_RSP, cpu_data->linux_sp +
513                            (NUM_ENTRY_REGS + 1) * sizeof(unsigned long));
514         ok &= vmcs_write64(GUEST_RIP, cpu_data->linux_ip);
515
516         ok &= vmcs_write32(GUEST_SYSENTER_CS,
517                            read_msr(MSR_IA32_SYSENTER_CS));
518         ok &= vmcs_write64(GUEST_SYSENTER_EIP,
519                            read_msr(MSR_IA32_SYSENTER_EIP));
520         ok &= vmcs_write64(GUEST_SYSENTER_ESP,
521                            read_msr(MSR_IA32_SYSENTER_ESP));
522
523         ok &= vmcs_write64(GUEST_DR7, 0x00000400);
524         ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
525
526         ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
527         ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
528         ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
529
530         ok &= vmcs_write64(GUEST_IA32_EFER, cpu_data->linux_efer);
531
532         /* TODO: switch PAT, PERF */
533
534         ok &= vmcs_write64(VMCS_LINK_POINTER, -1UL);
535         ok &= vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
536
537         val = read_msr(MSR_IA32_VMX_PINBASED_CTLS);
538         val |= PIN_BASED_NMI_EXITING;
539         ok &= vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, val);
540
541         ok &= vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, 0);
542
543         val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS);
544         val |= CPU_BASED_USE_IO_BITMAPS | CPU_BASED_USE_MSR_BITMAPS |
545                 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
546         val &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING);
547         ok &= vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, val);
548
549         ok &= vmcs_write64(MSR_BITMAP, page_map_hvirt2phys(msr_bitmap));
550
551         val = read_msr(MSR_IA32_VMX_PROCBASED_CTLS2);
552         val |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
553                 SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST |
554                 enable_rdtscp;
555         ok &= vmcs_write32(SECONDARY_VM_EXEC_CONTROL, val);
556
557         ok &= vmcs_write64(APIC_ACCESS_ADDR,
558                            page_map_hvirt2phys(apic_access_page));
559
560         ok &= vmx_set_cell_config(cpu_data->cell);
561
562         ok &= vmcs_write32(EXCEPTION_BITMAP, 0);
563
564         val = read_msr(MSR_IA32_VMX_EXIT_CTLS);
565         val |= VM_EXIT_HOST_ADDR_SPACE_SIZE | VM_EXIT_SAVE_IA32_EFER |
566                 VM_EXIT_LOAD_IA32_EFER;
567         ok &= vmcs_write32(VM_EXIT_CONTROLS, val);
568
569         ok &= vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
570         ok &= vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0);
571         ok &= vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0);
572
573         val = read_msr(MSR_IA32_VMX_ENTRY_CTLS);
574         val |= VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER;
575         ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
576
577         ok &= vmcs_write64(CR4_GUEST_HOST_MASK, 0);
578
579         ok &= vmcs_write32(CR3_TARGET_COUNT, 0);
580
581         return ok;
582 }
583
584 int vmx_cpu_init(struct per_cpu *cpu_data)
585 {
586         unsigned long cr4, feature_ctrl, mask;
587         u32 revision_id;
588         int err;
589
590         cr4 = read_cr4();
591         if (cr4 & X86_CR4_VMXE)
592                 return -EBUSY;
593
594         err = vmx_check_features();
595         if (err)
596                 return err;
597
598         revision_id = (u32)read_msr(MSR_IA32_VMX_BASIC);
599         cpu_data->vmxon_region.revision_id = revision_id;
600         cpu_data->vmxon_region.shadow_indicator = 0;
601         cpu_data->vmcs.revision_id = revision_id;
602         cpu_data->vmcs.shadow_indicator = 0;
603
604         // TODO: validate CR0
605
606         /* Note: We assume that TXT is off */
607         feature_ctrl = read_msr(MSR_IA32_FEATURE_CONTROL);
608         mask = FEATURE_CONTROL_LOCKED |
609                 FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
610
611         if ((feature_ctrl & mask) != mask) {
612                 if (feature_ctrl & FEATURE_CONTROL_LOCKED)
613                         return -ENODEV;
614
615                 feature_ctrl |= mask;
616                 write_msr(MSR_IA32_FEATURE_CONTROL, feature_ctrl);
617         }
618
619         write_cr4(cr4 | X86_CR4_VMXE);
620         // TODO: validate CR4
621
622         if (!vmxon(cpu_data))  {
623                 write_cr4(cr4);
624                 return -EIO;
625         }
626
627         cpu_data->vmx_state = VMXON;
628
629         if (!vmcs_clear(cpu_data) ||
630             !vmcs_load(cpu_data) ||
631             !vmcs_setup(cpu_data))
632                 return -EIO;
633
634         cpu_data->vmx_state = VMCS_READY;
635
636         return 0;
637 }
638
639 void vmx_cpu_exit(struct per_cpu *cpu_data)
640 {
641         if (cpu_data->vmx_state == VMXOFF)
642                 return;
643
644         cpu_data->vmx_state = VMXOFF;
645         /* Write vmx_state to ensure that vmx_schedule_vmexit stops accessing
646          * the VMCS (a compiler barrier would be sufficient, in fact). */
647         memory_barrier();
648
649         vmcs_clear(cpu_data);
650         asm volatile("vmxoff" : : : "cc");
651         write_cr4(read_cr4() & ~X86_CR4_VMXE);
652 }
653
654 void vmx_cpu_activate_vmm(struct per_cpu *cpu_data)
655 {
656         /* We enter Linux at the same point arch_entry would return to.
657          * rax is cleared to signal success to the caller. */
658         asm volatile(
659                 "mov (%%rdi),%%r15\n\t"
660                 "mov 0x8(%%rdi),%%r14\n\t"
661                 "mov 0x10(%%rdi),%%r13\n\t"
662                 "mov 0x18(%%rdi),%%r12\n\t"
663                 "mov 0x20(%%rdi),%%rbx\n\t"
664                 "mov 0x28(%%rdi),%%rbp\n\t"
665                 "vmlaunch\n\t"
666                 "pop %%rbp"
667                 : /* no output */
668                 : "a" (0), "D" (cpu_data->linux_reg)
669                 : "memory", "r15", "r14", "r13", "r12", "rbx", "rbp", "cc");
670
671         panic_printk("FATAL: vmlaunch failed, error %d\n",
672                      vmcs_read32(VM_INSTRUCTION_ERROR));
673         panic_stop(cpu_data);
674 }
675
676 static void __attribute__((noreturn))
677 vmx_cpu_deactivate_vmm(struct registers *guest_regs, struct per_cpu *cpu_data)
678 {
679         unsigned long *stack = (unsigned long *)vmcs_read64(GUEST_RSP);
680         unsigned long linux_ip = vmcs_read64(GUEST_RIP);
681
682         cpu_data->linux_cr3 = vmcs_read64(GUEST_CR3);
683
684         cpu_data->linux_gdtr.base = vmcs_read64(GUEST_GDTR_BASE);
685         cpu_data->linux_gdtr.limit = vmcs_read64(GUEST_GDTR_LIMIT);
686         cpu_data->linux_idtr.base = vmcs_read64(GUEST_IDTR_BASE);
687         cpu_data->linux_idtr.limit = vmcs_read64(GUEST_IDTR_LIMIT);
688
689         cpu_data->linux_cs.selector = vmcs_read32(GUEST_CS_SELECTOR);
690
691         cpu_data->linux_tss.selector = vmcs_read32(GUEST_TR_SELECTOR);
692
693         cpu_data->linux_efer = vmcs_read64(GUEST_IA32_EFER);
694         cpu_data->linux_fs.base = vmcs_read64(GUEST_FS_BASE);
695         cpu_data->linux_gs.base = vmcs_read64(GUEST_GS_BASE);
696
697         cpu_data->linux_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
698         cpu_data->linux_sysenter_eip = vmcs_read64(GUEST_SYSENTER_EIP);
699         cpu_data->linux_sysenter_esp = vmcs_read64(GUEST_SYSENTER_ESP);
700
701         cpu_data->linux_ds.selector = vmcs_read16(GUEST_DS_SELECTOR);
702         cpu_data->linux_es.selector = vmcs_read16(GUEST_ES_SELECTOR);
703         cpu_data->linux_fs.selector = vmcs_read16(GUEST_FS_SELECTOR);
704         cpu_data->linux_gs.selector = vmcs_read16(GUEST_GS_SELECTOR);
705
706         arch_cpu_restore(cpu_data);
707
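        /* place the return address on the Linux stack; the final ret below
         * resumes Linux at the guest RIP saved above */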
708         stack--;
709         *stack = linux_ip;
710
711         asm volatile (
712                 "mov %%rbx,%%rsp\n\t"
713                 "pop %%r15\n\t"
714                 "pop %%r14\n\t"
715                 "pop %%r13\n\t"
716                 "pop %%r12\n\t"
717                 "pop %%r11\n\t"
718                 "pop %%r10\n\t"
719                 "pop %%r9\n\t"
720                 "pop %%r8\n\t"
721                 "pop %%rdi\n\t"
722                 "pop %%rsi\n\t"
723                 "pop %%rbp\n\t"
724                 "add $8,%%rsp\n\t"
725                 "pop %%rbx\n\t"
726                 "pop %%rdx\n\t"
727                 "pop %%rcx\n\t"
728                 "mov %%rax,%%rsp\n\t"
729                 "xor %%rax,%%rax\n\t"
730                 "ret"
731                 : : "a" (stack), "b" (guest_regs));
732         __builtin_unreachable();
733 }
734
735 static void vmx_cpu_reset(struct per_cpu *cpu_data, unsigned int sipi_vector)
736 {
737         unsigned long val;
738         bool ok = true;
739
740         ok &= vmx_set_guest_cr(0, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
741         ok &= vmx_set_guest_cr(4, 0);
742
743         ok &= vmcs_write64(GUEST_CR3, 0);
744
745         ok &= vmcs_write64(GUEST_RFLAGS, 0x02);
746         ok &= vmcs_write64(GUEST_RSP, 0);
747
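        /*
         * A regular SIPI starts the CPU at vector:0000 in real mode; the
         * pseudo-SIPI used for the BSP starts it at the architectural reset
         * vector F000:FFF0 instead.
         */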
748         val = 0;
749         if (sipi_vector == APIC_BSP_PSEUDO_SIPI) {
750                 val = 0xfff0;
751                 sipi_vector = 0xf0;
752         }
753         ok &= vmcs_write64(GUEST_RIP, val);
754
755         ok &= vmcs_write16(GUEST_CS_SELECTOR, sipi_vector << 8);
756         ok &= vmcs_write64(GUEST_CS_BASE, sipi_vector << 12);
757         ok &= vmcs_write32(GUEST_CS_LIMIT, 0xffff);
758         ok &= vmcs_write32(GUEST_CS_AR_BYTES, 0x0009b);
759
760         ok &= vmcs_write16(GUEST_DS_SELECTOR, 0);
761         ok &= vmcs_write64(GUEST_DS_BASE, 0);
762         ok &= vmcs_write32(GUEST_DS_LIMIT, 0xffff);
763         ok &= vmcs_write32(GUEST_DS_AR_BYTES, 0x00093);
764
765         ok &= vmcs_write16(GUEST_ES_SELECTOR, 0);
766         ok &= vmcs_write64(GUEST_ES_BASE, 0);
767         ok &= vmcs_write32(GUEST_ES_LIMIT, 0xffff);
768         ok &= vmcs_write32(GUEST_ES_AR_BYTES, 0x00093);
769
770         ok &= vmcs_write16(GUEST_FS_SELECTOR, 0);
771         ok &= vmcs_write64(GUEST_FS_BASE, 0);
772         ok &= vmcs_write32(GUEST_FS_LIMIT, 0xffff);
773         ok &= vmcs_write32(GUEST_FS_AR_BYTES, 0x00093);
774
775         ok &= vmcs_write16(GUEST_GS_SELECTOR, 0);
776         ok &= vmcs_write64(GUEST_GS_BASE, 0);
777         ok &= vmcs_write32(GUEST_GS_LIMIT, 0xffff);
778         ok &= vmcs_write32(GUEST_GS_AR_BYTES, 0x00093);
779
780         ok &= vmcs_write16(GUEST_SS_SELECTOR, 0);
781         ok &= vmcs_write64(GUEST_SS_BASE, 0);
782         ok &= vmcs_write32(GUEST_SS_LIMIT, 0xffff);
783         ok &= vmcs_write32(GUEST_SS_AR_BYTES, 0x00093);
784
785         ok &= vmcs_write16(GUEST_TR_SELECTOR, 0);
786         ok &= vmcs_write64(GUEST_TR_BASE, 0);
787         ok &= vmcs_write32(GUEST_TR_LIMIT, 0xffff);
788         ok &= vmcs_write32(GUEST_TR_AR_BYTES, 0x0008b);
789
790         ok &= vmcs_write16(GUEST_LDTR_SELECTOR, 0);
791         ok &= vmcs_write64(GUEST_LDTR_BASE, 0);
792         ok &= vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
793         ok &= vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
794
795         ok &= vmcs_write64(GUEST_GDTR_BASE, 0);
796         ok &= vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
797         ok &= vmcs_write64(GUEST_IDTR_BASE, 0);
798         ok &= vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
799
800         ok &= vmcs_write64(GUEST_IA32_EFER, 0);
801
802         ok &= vmcs_write32(GUEST_SYSENTER_CS, 0);
803         ok &= vmcs_write64(GUEST_SYSENTER_EIP, 0);
804         ok &= vmcs_write64(GUEST_SYSENTER_ESP, 0);
805
806         ok &= vmcs_write64(GUEST_DR7, 0x00000400);
807         ok &= vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
808
809         ok &= vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
810         ok &= vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
811         ok &= vmcs_write64(GUEST_PENDING_DBG_EXCEPTIONS, 0);
812
813         val = vmcs_read32(VM_ENTRY_CONTROLS);
814         val &= ~VM_ENTRY_IA32E_MODE;
815         ok &= vmcs_write32(VM_ENTRY_CONTROLS, val);
816
817         ok &= vmx_set_cell_config(cpu_data->cell);
818
819         if (!ok) {
820                 panic_printk("FATAL: CPU reset failed\n");
821                 panic_stop(cpu_data);
822         }
823 }
824
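/*
 * Arm the VMX preemption timer (its value stays at 0, see vmcs_setup) so
 * that the CPU drops back into the hypervisor right after the next VM entry
 * and can process pending events there.
 */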
825 void vmx_schedule_vmexit(struct per_cpu *cpu_data)
826 {
827         u32 pin_based_ctrl;
828
829         if (cpu_data->vmx_state != VMCS_READY)
830                 return;
831
832         pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
833         pin_based_ctrl |= PIN_BASED_VMX_PREEMPTION_TIMER;
834         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
835 }
836
837 void vmx_cpu_park(struct per_cpu *cpu_data)
838 {
839         vmx_cpu_reset(cpu_data, 0);
840         vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_HLT);
841 }
842
843 static void vmx_disable_preemption_timer(void)
844 {
845         u32 pin_based_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
846
847         pin_based_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
848         vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, pin_based_ctrl);
849 }
850
851 static void vmx_skip_emulated_instruction(unsigned int inst_len)
852 {
853         vmcs_write64(GUEST_RIP, vmcs_read64(GUEST_RIP) + inst_len);
854 }
855
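/*
 * Mirror what the CPU does when paging is enabled with EFER.LME set:
 * activate long mode by setting EFER.LMA and the "IA-32e mode guest"
 * VM-entry control.
 */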
856 static void update_efer(void)
857 {
858         unsigned long efer = vmcs_read64(GUEST_IA32_EFER);
859
860         if ((efer & (EFER_LME | EFER_LMA)) != EFER_LME)
861                 return;
862
863         efer |= EFER_LMA;
864         vmcs_write64(GUEST_IA32_EFER, efer);
865         vmcs_write32(VM_ENTRY_CONTROLS,
866                      vmcs_read32(VM_ENTRY_CONTROLS) | VM_ENTRY_IA32E_MODE);
867 }
868
869 static void vmx_handle_hypercall(struct registers *guest_regs,
870                                  struct per_cpu *cpu_data)
871 {
872         unsigned long code = guest_regs->rax;
873
874         vmx_skip_emulated_instruction(X86_INST_LEN_VMCALL);
875
876         if ((!(vmcs_read64(GUEST_IA32_EFER) & EFER_LMA) &&
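        /* reject hypercalls issued from virtual-8086 mode or from CPL > 0 */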
877              vmcs_read64(GUEST_RFLAGS) & X86_RFLAGS_VM) ||
878             (vmcs_read16(GUEST_CS_SELECTOR) & 3) != 0) {
879                 guest_regs->rax = -EPERM;
880                 return;
881         }
882
883         guest_regs->rax = hypercall(cpu_data, code, guest_regs->rdi,
884                                     guest_regs->rsi);
885         if (guest_regs->rax == -ENOSYS)
886                 printk("CPU %d: Unknown vmcall %d, RIP: %p\n",
887                        cpu_data->cpu_id, code,
888                        vmcs_read64(GUEST_RIP) - X86_INST_LEN_VMCALL);
889
890         if (code == JAILHOUSE_HC_DISABLE && guest_regs->rax == 0)
891                 vmx_cpu_deactivate_vmm(guest_regs, cpu_data);
892 }
893
894 static bool vmx_handle_cr(struct registers *guest_regs,
895                           struct per_cpu *cpu_data)
896 {
897         u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
898         unsigned long cr, reg, val;
899
900         cr = exit_qualification & 0xf;
901         reg = (exit_qualification >> 8) & 0xf;
902
903         switch ((exit_qualification >> 4) & 3) {
904         case 0: /* move to cr */
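                /*
                 * struct registers holds the GPRs in reverse push order
                 * (r15 first, rax last), so 15 - reg converts the
                 * exit-qualification register number (0 = RAX .. 15 = R15);
                 * the RSP slot is not populated, so RSP is read from the
                 * VMCS instead.
                 */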
905                 if (reg == 4)
906                         val = vmcs_read64(GUEST_RSP);
907                 else
908                         val = ((unsigned long *)guest_regs)[15 - reg];
909
910                 if (cr == 0 || cr == 4) {
911                         vmx_skip_emulated_instruction(X86_INST_LEN_MOV_TO_CR);
912                         /* TODO: check for #GP reasons */
913                         vmx_set_guest_cr(cr, val);
914                         if (cr == 0 && val & X86_CR0_PG)
915                                 update_efer();
916                         return true;
917                 }
918                 break;
919         default:
920                 break;
921         }
922         panic_printk("FATAL: Unhandled CR access, qualification %x\n",
923                      exit_qualification);
924         return false;
925 }
926
927 static bool
928 vmx_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
929 {
930         if (vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE) {
931                 pg_structs->root_paging = x86_64_paging;
932                 pg_structs->root_table_gphys =
933                         vmcs_read64(GUEST_CR3) & 0x000ffffffffff000UL;
934         } else if (vmcs_read64(GUEST_CR0) & X86_CR0_PG &&
935                  !(vmcs_read64(GUEST_CR4) & X86_CR4_PAE)) {
936                 pg_structs->root_paging = i386_paging;
937                 pg_structs->root_table_gphys =
938                         vmcs_read64(GUEST_CR3) & 0xfffff000UL;
939         } else {
940                 printk("FATAL: Unsupported paging mode\n");
941                 return false;
942         }
943         return true;
944 }
945
946 static bool vmx_handle_apic_access(struct registers *guest_regs,
947                                    struct per_cpu *cpu_data)
948 {
949         struct guest_paging_structures pg_structs;
950         unsigned int inst_len, offset;
951         u64 qualification;
952         bool is_write;
953
954         qualification = vmcs_read64(EXIT_QUALIFICATION);
955
956         switch (qualification & APIC_ACCESS_TYPE_MASK) {
957         case APIC_ACCESS_TYPE_LINEAR_READ:
958         case APIC_ACCESS_TYPE_LINEAR_WRITE:
959                 is_write = !!(qualification & APIC_ACCESS_TYPE_LINEAR_WRITE);
960                 offset = qualification & APIC_ACCESS_OFFSET_MASK;
961                 if (offset & 0x00f)
962                         break;
963
964                 if (!vmx_get_guest_paging_structs(&pg_structs))
965                         break;
966
967                 inst_len = apic_mmio_access(guest_regs, cpu_data,
968                                             vmcs_read64(GUEST_RIP),
969                                             &pg_structs, offset >> 4,
970                                             is_write);
971                 if (!inst_len)
972                         break;
973
974                 vmx_skip_emulated_instruction(inst_len);
975                 return true;
976         }
977         panic_printk("FATAL: Unhandled APIC access, "
978                      "qualification %x\n", qualification);
979         return false;
980 }
981
982 static void dump_vm_exit_details(u32 reason)
983 {
984         panic_printk("qualification %x\n", vmcs_read64(EXIT_QUALIFICATION));
985         panic_printk("vectoring info: %x interrupt info: %x\n",
986                      vmcs_read32(IDT_VECTORING_INFO_FIELD),
987                      vmcs_read32(VM_EXIT_INTR_INFO));
988         if (reason == EXIT_REASON_EPT_VIOLATION ||
989             reason == EXIT_REASON_EPT_MISCONFIG)
990                 panic_printk("guest phys addr %p guest linear addr: %p\n",
991                              vmcs_read64(GUEST_PHYSICAL_ADDRESS),
992                              vmcs_read64(GUEST_LINEAR_ADDRESS));
993 }
994
995 static void dump_guest_regs(struct registers *guest_regs)
996 {
997         panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcs_read64(GUEST_RIP),
998                      vmcs_read64(GUEST_RSP), vmcs_read64(GUEST_RFLAGS));
999         panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
1000                      guest_regs->rbx, guest_regs->rcx);
1001         panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
1002                      guest_regs->rsi, guest_regs->rdi);
1003         panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
1004                      vmcs_read64(GUEST_CS_SELECTOR),
1005                      vmcs_read64(GUEST_CS_BASE),
1006                      vmcs_read32(GUEST_CS_AR_BYTES),
1007                      !!(vmcs_read32(VM_ENTRY_CONTROLS) & VM_ENTRY_IA32E_MODE));
1008         panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcs_read64(GUEST_CR0),
1009                      vmcs_read64(GUEST_CR3), vmcs_read64(GUEST_CR4));
1010         panic_printk("EFER: %p\n", vmcs_read64(GUEST_IA32_EFER));
1011 }
1012
1013 static bool vmx_handle_io_access(struct registers *guest_regs,
1014                                  struct per_cpu *cpu_data)
1015 {
1016         /* parse exit qualification for I/O instructions (see SDM, 27.2.1) */
1017         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1018         u16 port = (exitq >> 16) & 0xFFFF;
1019         bool dir_in = (exitq & 0x8) >> 3;
1020         unsigned int size = (exitq & 0x3) + 1;
1021
1022         /* string and REP-prefixed instructions are not supported */
1023         if (exitq & 0x30)
1024                 goto invalid_access;
1025
1026         if (x86_pci_config_handler(guest_regs, cpu_data->cell, port, dir_in,
1027                                    size) == 1) {
1028                 vmx_skip_emulated_instruction(
1029                                 vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
1030                 return true;
1031         }
1032
1033 invalid_access:
1034         panic_printk("FATAL: Invalid PIO %s, port: %x size: %d\n",
1035                      dir_in ? "read" : "write", port, size);
1036         panic_printk("PCI address port: %x\n",
1037                      cpu_data->cell->pci_addr_port_val);
1038         return false;
1039 }
1040
1041 static bool vmx_handle_ept_violation(struct registers *guest_regs,
1042                                      struct per_cpu *cpu_data)
1043 {
1044         u64 phys_addr = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
1045         u64 exitq = vmcs_read64(EXIT_QUALIFICATION);
1046         struct guest_paging_structures pg_structs;
1047         struct mmio_access access;
1048         int result = 0;
1049         bool is_write;
1050         u32 val;
1051
1052         /* We don't enable dirty/accessed bit updates in EPTP, so only the
1053          * read or the write flag can be set, not both. */
1054         is_write = !!(exitq & 0x2);
1055
1056         if (!vmx_get_guest_paging_structs(&pg_structs))
1057                 goto invalid_access;
1058
1059         access = mmio_parse(cpu_data, vmcs_read64(GUEST_RIP),
1060                             &pg_structs, is_write);
1061         if (!access.inst_len || access.size != 4)
1062                 goto invalid_access;
1063
1064         if (is_write)
1065                 val = ((unsigned long *)guest_regs)[access.reg];
1066
1067         result = ioapic_access_handler(cpu_data->cell, is_write, phys_addr,
1068                                        &val);
1069         if (result == 0)
1070                 result = pci_mmio_access_handler(cpu_data->cell, is_write,
1071                                                  phys_addr, &val);
1072
1073         if (result == 1) {
1074                 if (!is_write)
1075                         ((unsigned long *)guest_regs)[access.reg] = val;
1076                 vmx_skip_emulated_instruction(
1077                                 vmcs_read64(VM_EXIT_INSTRUCTION_LEN));
1078                 return true;
1079         }
1080
1081 invalid_access:
1082         /* report only unhandled access failures */
1083         if (result == 0)
1084                 panic_printk("FATAL: Invalid MMIO/RAM %s, addr: %p\n",
1085                              is_write ? "write" : "read", phys_addr);
1086         return false;
1087 }
1088
1089 void vmx_handle_exit(struct registers *guest_regs, struct per_cpu *cpu_data)
1090 {
1091         u32 reason = vmcs_read32(VM_EXIT_REASON);
1092         int sipi_vector;
1093
1094         cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;
1095
1096         switch (reason) {
1097         case EXIT_REASON_EXCEPTION_NMI:
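                /*
                 * An NMI interrupted guest mode; reflect it into the
                 * hypervisor's own NMI handler, then fall through to
                 * process pending events.
                 */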
1098                 asm volatile("int %0" : : "i" (NMI_VECTOR));
1099                 /* fall through */
1100         case EXIT_REASON_PREEMPTION_TIMER:
1101                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MANAGEMENT]++;
1102                 vmx_disable_preemption_timer();
1103                 sipi_vector = x86_handle_events(cpu_data);
1104                 if (sipi_vector >= 0) {
1105                         printk("CPU %d received SIPI, vector %x\n",
1106                                cpu_data->cpu_id, sipi_vector);
1107                         vmx_cpu_reset(cpu_data, sipi_vector);
1108                         memset(guest_regs, 0, sizeof(*guest_regs));
1109                 }
1110                 vtd_check_pending_faults(cpu_data);
1111                 return;
1112         case EXIT_REASON_CPUID:
1113                 vmx_skip_emulated_instruction(X86_INST_LEN_CPUID);
1114                 guest_regs->rax &= 0xffffffff;
1115                 guest_regs->rbx &= 0xffffffff;
1116                 guest_regs->rcx &= 0xffffffff;
1117                 guest_regs->rdx &= 0xffffffff;
1118                 __cpuid((u32 *)&guest_regs->rax, (u32 *)&guest_regs->rbx,
1119                         (u32 *)&guest_regs->rcx, (u32 *)&guest_regs->rdx);
1120                 return;
1121         case EXIT_REASON_VMCALL:
1122                 vmx_handle_hypercall(guest_regs, cpu_data);
1123                 return;
1124         case EXIT_REASON_CR_ACCESS:
1125                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_CR]++;
1126                 if (vmx_handle_cr(guest_regs, cpu_data))
1127                         return;
1128                 break;
1129         case EXIT_REASON_MSR_READ:
1130                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
1131                 if (guest_regs->rcx >= MSR_X2APIC_BASE &&
1132                     guest_regs->rcx <= MSR_X2APIC_END) {
1133                         vmx_skip_emulated_instruction(X86_INST_LEN_RDMSR);
1134                         x2apic_handle_read(guest_regs);
1135                         return;
1136                 }
1137                 panic_printk("FATAL: Unhandled MSR read: %08x\n",
1138                              guest_regs->rcx);
1139                 break;
1140         case EXIT_REASON_MSR_WRITE:
1141                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MSR]++;
1142                 if (guest_regs->rcx == MSR_X2APIC_ICR) {
1143                         if (!apic_handle_icr_write(cpu_data, guest_regs->rax,
1144                                                    guest_regs->rdx))
1145                                 break;
1146                         vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
1147                         return;
1148                 }
1149                 if (guest_regs->rcx >= MSR_X2APIC_BASE &&
1150                     guest_regs->rcx <= MSR_X2APIC_END) {
1151                         x2apic_handle_write(guest_regs);
1152                         vmx_skip_emulated_instruction(X86_INST_LEN_WRMSR);
1153                         return;
1154                 }
1155                 panic_printk("FATAL: Unhandled MSR write: %08x\n",
1156                              guest_regs->rcx);
1157                 break;
1158         case EXIT_REASON_APIC_ACCESS:
1159                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XAPIC]++;
1160                 if (vmx_handle_apic_access(guest_regs, cpu_data))
1161                         return;
1162                 break;
1163         case EXIT_REASON_XSETBV:
1164                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_XSETBV]++;
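                /* only XCR0 (rcx == 0) with x87 enabled and no unsupported
                 * feature bits may be written; execute xsetbv on behalf of
                 * the guest */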
1165                 if (guest_regs->rax & X86_XCR0_FP &&
1166                     (guest_regs->rax & ~cpuid_eax(0x0d)) == 0 &&
1167                     guest_regs->rcx == 0 && guest_regs->rdx == 0) {
1168                         vmx_skip_emulated_instruction(X86_INST_LEN_XSETBV);
1169                         asm volatile(
1170                                 "xsetbv"
1171                                 : /* no output */
1172                                 : "a" (guest_regs->rax), "c" (0), "d" (0));
1173                         return;
1174                 }
1175                 panic_printk("FATAL: Invalid xsetbv parameters: "
1176                              "xcr[%d] = %08x:%08x\n", guest_regs->rcx,
1177                              guest_regs->rdx, guest_regs->rax);
1178                 break;
1179         case EXIT_REASON_IO_INSTRUCTION:
1180                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_PIO]++;
1181                 if (vmx_handle_io_access(guest_regs, cpu_data))
1182                         return;
1183                 break;
1184         case EXIT_REASON_EPT_VIOLATION:
1185                 cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_MMIO]++;
1186                 if (vmx_handle_ept_violation(guest_regs, cpu_data))
1187                         return;
1188                 break;
1189         default:
1190                 panic_printk("FATAL: %s, reason %d\n",
1191                              (reason & EXIT_REASONS_FAILED_VMENTRY) ?
1192                              "VM-Entry failure" : "Unhandled VM-Exit",
1193                              (u16)reason);
1194                 dump_vm_exit_details(reason);
1195                 break;
1196         }
1197         dump_guest_regs(guest_regs);
1198         panic_halt(cpu_data);
1199 }
1200
1201 void vmx_entry_failure(struct per_cpu *cpu_data)
1202 {
1203         panic_printk("FATAL: vmresume failed, error %d\n",
1204                      vmcs_read32(VM_INSTRUCTION_ERROR));
1205         panic_stop(cpu_data);
1206 }