/*
 * Jailhouse, a Linux-based partitioning hypervisor
 *
 * Copyright (c) Siemens AG, 2013
 * Copyright (c) Valentine Sinitsyn, 2014
 *
 * Authors:
 *  Jan Kiszka <jan.kiszka@siemens.com>
 *  Valentine Sinitsyn <valentine.sinitsyn@gmail.com>
 *
 * Based on vmx.c written by Jan Kiszka.
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 */

#include <jailhouse/entry.h>
#include <jailhouse/cell-config.h>
#include <jailhouse/control.h>
#include <jailhouse/paging.h>
#include <jailhouse/printk.h>
#include <jailhouse/processor.h>
#include <jailhouse/string.h>
#include <asm/apic.h>
#include <asm/cell.h>
#include <asm/paging.h>
#include <asm/percpu.h>
#include <asm/processor.h>
#include <asm/svm.h>
#include <asm/vcpu.h>

/*
 * The NW bit is ignored by all modern processors; however, some
 * combinations of the NW and CD bits are prohibited by SVM (see APMv2,
 * Sect. 15.5). To handle this, we always keep the NW bit off.
 */
#define SVM_CR0_CLEARED_BITS    ~X86_CR0_NW

static bool has_avic, has_assists, has_flush_by_asid;

static const struct segment invalid_seg;

static struct paging npt_paging[NPT_PAGE_DIR_LEVELS];

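/*
 * The MSR permission map uses two bits per MSR: the lower bit of each pair
 * intercepts reads, the upper bit intercepts writes. Each byte therefore
 * covers four consecutive MSRs, hence the "MSR number / 4" indices below.
 */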
static u8 __attribute__((aligned(PAGE_SIZE))) msrpm[][0x2000/4] = {
        [ SVM_MSRPM_0000 ] = {
                [      0/4 ...  0x017/4 ] = 0,
                [  0x018/4 ...  0x01b/4 ] = 0x80, /* 0x01b (w) */
                [  0x01c/4 ...  0x7ff/4 ] = 0,
                /* x2APIC MSRs - emulated if not present */
                [  0x800/4 ...  0x803/4 ] = 0x90, /* 0x802 (r), 0x803 (r) */
                [  0x804/4 ...  0x807/4 ] = 0,
                [  0x808/4 ...  0x80b/4 ] = 0x93, /* 0x808 (rw), 0x80a (r), 0x80b (w) */
                [  0x80c/4 ...  0x80f/4 ] = 0xc8, /* 0x80d (w), 0x80f (rw) */
                [  0x810/4 ...  0x813/4 ] = 0x55, /* 0x810 - 0x813 (r) */
                [  0x814/4 ...  0x817/4 ] = 0x55, /* 0x814 - 0x817 (r) */
                [  0x818/4 ...  0x81b/4 ] = 0x55, /* 0x818 - 0x81b (r) */
                [  0x81c/4 ...  0x81f/4 ] = 0x55, /* 0x81c - 0x81f (r) */
                [  0x820/4 ...  0x823/4 ] = 0x55, /* 0x820 - 0x823 (r) */
                [  0x824/4 ...  0x827/4 ] = 0x55, /* 0x824 - 0x827 (r) */
                [  0x828/4 ...  0x82b/4 ] = 0x03, /* 0x828 (rw) */
                [  0x82c/4 ...  0x82f/4 ] = 0xc0, /* 0x82f (rw) */
                [  0x830/4 ...  0x833/4 ] = 0xf3, /* 0x830 (rw), 0x832 (rw), 0x833 (rw) */
                [  0x834/4 ...  0x837/4 ] = 0xff, /* 0x834 - 0x837 (rw) */
                [  0x838/4 ...  0x83b/4 ] = 0x07, /* 0x838 (rw), 0x839 (r) */
                [  0x83c/4 ...  0x83f/4 ] = 0x70, /* 0x83e (rw), 0x83f (r) */
                [  0x840/4 ... 0x1fff/4 ] = 0,
        },
        [ SVM_MSRPM_C000 ] = {
                [      0/4 ...  0x07f/4 ] = 0,
                [  0x080/4 ...  0x083/4 ] = 0x02, /* 0x080 (w) */
                [  0x084/4 ... 0x1fff/4 ] = 0
        },
        [ SVM_MSRPM_C001 ] = {
                [      0/4 ... 0x1fff/4 ] = 0,
        },
        [ SVM_MSRPM_RESV ] = {
                [      0/4 ... 0x1fff/4 ] = 0,
        }
};

static void *avic_page;

static int svm_check_features(void)
{
        /* SVM is available */
        if (!(cpuid_ecx(0x80000001) & X86_FEATURE_SVM))
                return -ENODEV;

        /* Nested paging */
        if (!(cpuid_edx(0x8000000A) & X86_FEATURE_NP))
                return -EIO;

        /* Decode assists */
        if (cpuid_edx(0x8000000A) & X86_FEATURE_DECODE_ASSISTS)
                has_assists = true;

        /* AVIC support */
        if (cpuid_edx(0x8000000A) & X86_FEATURE_AVIC)
                has_avic = true;

        /* TLB Flush by ASID support */
        if (cpuid_edx(0x8000000A) & X86_FEATURE_FLUSH_BY_ASID)
                has_flush_by_asid = true;

        return 0;
}

static void set_svm_segment_from_dtr(struct svm_segment *svm_segment,
                                     const struct desc_table_reg *dtr)
{
        struct svm_segment tmp = { 0 };

        if (dtr) {
                tmp.base = dtr->base;
                tmp.limit = dtr->limit & 0xffff;
        }

        *svm_segment = tmp;
}

/* TODO: struct segment needs to be x86 generic, not VMX-specific one here */
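/*
 * VMX-format access rights keep the attribute bits in bits 0-7 and the
 * AVL/L/D/G flags in bits 12-15, with 0x10000 marking an unusable segment.
 * SVM expects a packed 12-bit attribute field, so the upper nibble is
 * shifted down next to the lower byte.
 */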
static void set_svm_segment_from_segment(struct svm_segment *svm_segment,
                                         const struct segment *segment)
{
        u32 ar;

        svm_segment->selector = segment->selector;

        if (segment->access_rights == 0x10000) {
                svm_segment->access_rights = 0;
        } else {
                ar = segment->access_rights;
                svm_segment->access_rights =
                        ((ar & 0xf000) >> 4) | (ar & 0x00ff);
        }

        svm_segment->limit = segment->limit;
        svm_segment->base = segment->base;
}

static bool vcpu_set_cell_config(struct cell *cell, struct vmcb *vmcb)
{
        /* No real need for this function; used for consistency with vmx.c */
        vmcb->iopm_base_pa = paging_hvirt2phys(cell->svm.iopm);
        vmcb->n_cr3 = paging_hvirt2phys(cell->svm.npt_structs.root_table);

        return true;
}

static int vmcb_setup(struct per_cpu *cpu_data)
{
        struct vmcb *vmcb = &cpu_data->vmcb;

        memset(vmcb, 0, sizeof(struct vmcb));

        vmcb->cr0 = read_cr0() & SVM_CR0_CLEARED_BITS;
        vmcb->cr3 = cpu_data->linux_cr3;
        vmcb->cr4 = read_cr4();

        set_svm_segment_from_segment(&vmcb->cs, &cpu_data->linux_cs);
        set_svm_segment_from_segment(&vmcb->ds, &cpu_data->linux_ds);
        set_svm_segment_from_segment(&vmcb->es, &cpu_data->linux_es);
        set_svm_segment_from_segment(&vmcb->fs, &cpu_data->linux_fs);
        set_svm_segment_from_segment(&vmcb->gs, &cpu_data->linux_gs);
        set_svm_segment_from_segment(&vmcb->ss, &invalid_seg);
        set_svm_segment_from_segment(&vmcb->tr, &cpu_data->linux_tss);

        set_svm_segment_from_dtr(&vmcb->ldtr, NULL);
        set_svm_segment_from_dtr(&vmcb->gdtr, &cpu_data->linux_gdtr);
        set_svm_segment_from_dtr(&vmcb->idtr, &cpu_data->linux_idtr);

        vmcb->cpl = 0; /* Linux runs in ring 0 before migration */

        vmcb->rflags = 0x02;
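        /*
         * linux_sp still points below the registers saved by the hypervisor
         * entry path; skip NUM_ENTRY_REGS saved registers plus the return
         * address so the guest resumes with its original stack pointer.
         */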
        vmcb->rsp = cpu_data->linux_sp +
                (NUM_ENTRY_REGS + 1) * sizeof(unsigned long);
        vmcb->rip = cpu_data->linux_ip;

        vmcb->sysenter_cs = read_msr(MSR_IA32_SYSENTER_CS);
        vmcb->sysenter_eip = read_msr(MSR_IA32_SYSENTER_EIP);
        vmcb->sysenter_esp = read_msr(MSR_IA32_SYSENTER_ESP);
        vmcb->star = read_msr(MSR_STAR);
        vmcb->lstar = read_msr(MSR_LSTAR);
        vmcb->cstar = read_msr(MSR_CSTAR);
        vmcb->sfmask = read_msr(MSR_SFMASK);
        vmcb->kerngsbase = read_msr(MSR_KERNGS_BASE);

        vmcb->dr6 = 0x00000ff0;
        vmcb->dr7 = 0x00000400;

        /* Make the hypervisor visible */
        vmcb->efer = (cpu_data->linux_efer | EFER_SVME);

        /* Linux uses a custom PAT setting */
        vmcb->g_pat = read_msr(MSR_IA32_PAT);

        vmcb->general1_intercepts |= GENERAL1_INTERCEPT_NMI;
        vmcb->general1_intercepts |= GENERAL1_INTERCEPT_CR0_SEL_WRITE;
        /* TODO: Do we need this for SVM? */
        /* vmcb->general1_intercepts |= GENERAL1_INTERCEPT_CPUID; */
        vmcb->general1_intercepts |= GENERAL1_INTERCEPT_IOIO_PROT;
        vmcb->general1_intercepts |= GENERAL1_INTERCEPT_MSR_PROT;
        vmcb->general1_intercepts |= GENERAL1_INTERCEPT_SHUTDOWN_EVT;

        vmcb->general2_intercepts |= GENERAL2_INTERCEPT_VMRUN; /* Required */
        vmcb->general2_intercepts |= GENERAL2_INTERCEPT_VMMCALL;

        vmcb->msrpm_base_pa = paging_hvirt2phys(msrpm);

        vmcb->np_enable = 1;
        /* No more than one guest owns the CPU */
        vmcb->guest_asid = 1;

        /* TODO: Setup AVIC */

        return vcpu_set_cell_config(cpu_data->cell, vmcb);
}

unsigned long arch_paging_gphys2phys(struct per_cpu *cpu_data,
                                     unsigned long gphys,
                                     unsigned long flags)
{
        return paging_virt2phys(&cpu_data->cell->svm.npt_structs,
                        gphys, flags);
}

static void npt_set_next_pt(pt_entry_t pte, unsigned long next_pt)
{
        /* See APMv2, Section 15.25.5 */
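        /*
         * Nested page table walks treat guest accesses as user accesses,
         * so the US flag has to be set at every level.
         */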
        *pte = (next_pt & 0x000ffffffffff000UL) |
                (PAGE_DEFAULT_FLAGS | PAGE_FLAG_US);
}

int vcpu_vendor_init(void)
{
        unsigned long vm_cr;
        int err, n;

        err = svm_check_features();
        if (err)
                return err;

        vm_cr = read_msr(MSR_VM_CR);
        if (vm_cr & VM_CR_SVMDIS)
                /* SVM disabled in BIOS */
                return -EPERM;

        /* Nested paging is the same as the native one */
        memcpy(npt_paging, x86_64_paging, sizeof(npt_paging));
        for (n = 0; n < NPT_PAGE_DIR_LEVELS; n++)
                npt_paging[n].set_next_pt = npt_set_next_pt;

        /* This is always false for AMD now (except in nested SVM);
           see Sect. 16.3.1 in APMv2 */
        if (using_x2apic) {
                /* allow direct x2APIC access except for ICR writes */
                memset(&msrpm[SVM_MSRPM_0000][MSR_X2APIC_BASE/4], 0,
                                (MSR_X2APIC_END - MSR_X2APIC_BASE + 1)/4);
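                /* 0x02 is the write-intercept bit for the first MSR in this
                   byte, i.e. the ICR */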
                msrpm[SVM_MSRPM_0000][MSR_X2APIC_ICR/4] = 0x02;
        } else {
                /* Enable Extended Interrupt LVT */
                apic_reserved_bits[0x50] = 0;
                if (has_avic) {
                        avic_page = page_alloc(&remap_pool, 1);
                        if (!avic_page)
                                return -ENOMEM;
                }
        }

        return vcpu_cell_init(&root_cell);
}

int vcpu_vendor_cell_init(struct cell *cell)
{
        u64 flags;
        int err;

        /* allocate iopm (two 4-K pages + 3 bits) */
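        /* (SVM requires a contiguous 12-Kbyte IOPM, hence three 4-K pages) */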
        cell->svm.iopm = page_alloc(&mem_pool, 3);
        if (!cell->svm.iopm)
                return -ENOMEM;

        /* build root NPT of cell */
        cell->svm.npt_structs.root_paging = npt_paging;
        cell->svm.npt_structs.root_table = page_alloc(&mem_pool, 1);
        if (!cell->svm.npt_structs.root_table)
                return -ENOMEM;

        if (!has_avic) {
                /*
                 * Map xAPIC as is; reads are passed, writes are trapped.
                 */
                flags = PAGE_READONLY_FLAGS |
                        PAGE_FLAG_US |
                        PAGE_FLAG_UNCACHED;
                err = paging_create(&cell->svm.npt_structs, XAPIC_BASE,
                                    PAGE_SIZE, XAPIC_BASE,
                                    flags,
                                    PAGING_NON_COHERENT);
        } else {
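                /*
                 * Back the xAPIC MMIO page with the shared avic_page for
                 * now; the actual AVIC setup is still a TODO (see
                 * vmcb_setup()).
                 */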
                flags = PAGE_DEFAULT_FLAGS | PAGE_FLAG_UNCACHED;
                err = paging_create(&cell->svm.npt_structs,
                                    paging_hvirt2phys(avic_page),
                                    PAGE_SIZE, XAPIC_BASE,
                                    flags,
                                    PAGING_NON_COHERENT);
        }

        return err;
}

int vcpu_map_memory_region(struct cell *cell,
                           const struct jailhouse_memory *mem)
{
        u64 phys_start = mem->phys_start;
        u32 flags = PAGE_FLAG_US; /* See APMv2, Section 15.25.5 */

        if (mem->flags & JAILHOUSE_MEM_READ)
                flags |= PAGE_FLAG_PRESENT;
        if (mem->flags & JAILHOUSE_MEM_WRITE)
                flags |= PAGE_FLAG_RW;
        if (mem->flags & JAILHOUSE_MEM_EXECUTE)
                flags |= PAGE_FLAG_EXECUTE;
        if (mem->flags & JAILHOUSE_MEM_COMM_REGION)
                phys_start = paging_hvirt2phys(&cell->comm_page);

        return paging_create(&cell->svm.npt_structs, phys_start, mem->size,
                             mem->virt_start, flags, PAGING_NON_COHERENT);
}

int vcpu_unmap_memory_region(struct cell *cell,
                             const struct jailhouse_memory *mem)
{
        return paging_destroy(&cell->svm.npt_structs, mem->virt_start,
                              mem->size, PAGING_NON_COHERENT);
}

void vcpu_vendor_cell_exit(struct cell *cell)
{
        paging_destroy(&cell->svm.npt_structs, XAPIC_BASE, PAGE_SIZE,
                       PAGING_NON_COHERENT);
        page_free(&mem_pool, cell->svm.npt_structs.root_table, 1);
}

int vcpu_init(struct per_cpu *cpu_data)
{
        unsigned long efer;
        int err;

        err = svm_check_features();
        if (err)
                return err;

        efer = read_msr(MSR_EFER);
        if (efer & EFER_SVME)
                return -EBUSY;

        efer |= EFER_SVME;
        write_msr(MSR_EFER, efer);

        cpu_data->svm_state = SVMON;

        if (!vmcb_setup(cpu_data))
                return -EIO;

        write_msr(MSR_VM_HSAVE_PA, paging_hvirt2phys(cpu_data->host_state));

        /* Enable Extended Interrupt LVT (xAPIC, as it is AMD-only) */
        if (!using_x2apic)
                apic_reserved_bits[0x50] = 0;

        return 0;
}

void vcpu_exit(struct per_cpu *cpu_data)
{
        unsigned long efer;

        if (cpu_data->svm_state == SVMOFF)
                return;

        cpu_data->svm_state = SVMOFF;

        efer = read_msr(MSR_EFER);
        efer &= ~EFER_SVME;
        write_msr(MSR_EFER, efer);

        write_msr(MSR_VM_HSAVE_PA, 0);
}

void vcpu_activate_vmm(struct per_cpu *cpu_data)
{
        /* TODO: Implement */
        __builtin_unreachable();
}

void __attribute__((noreturn))
vcpu_deactivate_vmm(struct registers *guest_regs)
{
        /* TODO: Implement */
        __builtin_unreachable();
}

void vcpu_skip_emulated_instruction(unsigned int inst_len)
{
        struct per_cpu *cpu_data = this_cpu_data();
        struct vmcb *vmcb = &cpu_data->vmcb;

        vmcb->rip += inst_len;
}

bool vcpu_get_guest_paging_structs(struct guest_paging_structures *pg_structs)
{
        struct per_cpu *cpu_data = this_cpu_data();
        struct vmcb *vmcb = &cpu_data->vmcb;

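        /*
         * Note: 32-bit PAE paging is not handled and falls through to the
         * error path below.
         */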
        if (vmcb->efer & EFER_LMA) {
                pg_structs->root_paging = x86_64_paging;
                pg_structs->root_table_gphys =
                        vmcb->cr3 & 0x000ffffffffff000UL;
        } else if ((vmcb->cr0 & X86_CR0_PG) &&
                   !(vmcb->cr4 & X86_CR4_PAE)) {
                pg_structs->root_paging = i386_paging;
                pg_structs->root_table_gphys =
                        vmcb->cr3 & 0xfffff000UL;
        } else if (!(vmcb->cr0 & X86_CR0_PG)) {
                /*
                 * Can be in non-paged protected mode as well, but
                 * the translation mechanism will stay the same anyway.
                 */
                pg_structs->root_paging = realmode_paging;
                /*
                 * This will make paging_get_guest_pages map the page
                 * that also contains the bootstrap code and, thus, is
                 * always present in a cell.
                 */
                pg_structs->root_table_gphys = 0xff000;
        } else {
                printk("FATAL: Unsupported paging mode\n");
                return false;
        }
        return true;
}

static void dump_guest_regs(struct registers *guest_regs, struct vmcb *vmcb)
{
        panic_printk("RIP: %p RSP: %p FLAGS: %x\n", vmcb->rip,
                     vmcb->rsp, vmcb->rflags);
        panic_printk("RAX: %p RBX: %p RCX: %p\n", guest_regs->rax,
                     guest_regs->rbx, guest_regs->rcx);
        panic_printk("RDX: %p RSI: %p RDI: %p\n", guest_regs->rdx,
                     guest_regs->rsi, guest_regs->rdi);
        panic_printk("CS: %x BASE: %p AR-BYTES: %x EFER.LMA %d\n",
                     vmcb->cs.selector,
                     vmcb->cs.base,
                     vmcb->cs.access_rights,
                     (vmcb->efer & EFER_LMA));
        panic_printk("CR0: %p CR3: %p CR4: %p\n", vmcb->cr0,
                     vmcb->cr3, vmcb->cr4);
        panic_printk("EFER: %p\n", vmcb->efer);
}

void vcpu_handle_exit(struct registers *guest_regs, struct per_cpu *cpu_data)
{
        struct vmcb *vmcb = &cpu_data->vmcb;
        struct vcpu_execution_state x_state;

        /* Restore GS value expected by per_cpu data accessors */
        write_msr(MSR_GS_BASE, (unsigned long)cpu_data);

        cpu_data->stats[JAILHOUSE_CPU_STAT_VMEXITS_TOTAL]++;

        switch (vmcb->exitcode) {
        case VMEXIT_INVALID:
                panic_printk("FATAL: VM-Entry failure, error %d\n",
                             vmcb->exitcode);
                break;
        case VMEXIT_CPUID:
                /* FIXME: We are not intercepting CPUID now */
                return;
        case VMEXIT_VMMCALL:
                vcpu_vendor_get_execution_state(&x_state);
                vcpu_handle_hypercall(guest_regs, &x_state);
                return;
        default:
                panic_printk("FATAL: Unexpected #VMEXIT, exitcode %x, "
                             "exitinfo1 %p exitinfo2 %p\n",
                             vmcb->exitcode, vmcb->exitinfo1, vmcb->exitinfo2);
        }
        dump_guest_regs(guest_regs, vmcb);
        panic_park();
}

void vcpu_park(struct per_cpu *cpu_data)
{
        /* TODO: Implement */
}

void vcpu_nmi_handler(struct per_cpu *cpu_data)
{
        /* TODO: Implement */
}

void vcpu_tlb_flush(void)
{
        struct per_cpu *cpu_data = this_cpu_data();
        struct vmcb *vmcb = &cpu_data->vmcb;

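        /*
         * Flush only this guest's ASID when the CPU supports it; otherwise
         * fall back to flushing the entire TLB on the next VMRUN.
         */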
        if (has_flush_by_asid)
                vmcb->tlb_control = SVM_TLB_FLUSH_GUEST;
        else
                vmcb->tlb_control = SVM_TLB_FLUSH_ALL;
}

const u8 *vcpu_get_inst_bytes(const struct guest_paging_structures *pg_structs,
                              unsigned long pc, unsigned int *size)
{
        struct per_cpu *cpu_data = this_cpu_data();
        struct vmcb *vmcb = &cpu_data->vmcb;
        unsigned long start;

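        /*
         * With decode assists, the CPU deposits the bytes of the intercepted
         * instruction in the VMCB, so no guest page walk is needed as long
         * as the requested range lies within the fetched bytes.
         */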
        if (has_assists) {
                if (!*size)
                        return NULL;
                start = vmcb->rip - pc;
                if (start < vmcb->bytes_fetched) {
                        *size = vmcb->bytes_fetched - start;
                        return &vmcb->guest_bytes[start];
                } else {
                        return NULL;
                }
        } else {
                return vcpu_map_inst(pg_structs, pc, size);
        }
}

void vcpu_vendor_get_cell_io_bitmap(struct cell *cell,
                                    struct vcpu_io_bitmap *iobm)
{
        iobm->data = cell->svm.iopm;
        iobm->size = 3 * PAGE_SIZE; /* iopm is allocated as three pages in
                                       vcpu_vendor_cell_init() */
}

void vcpu_vendor_get_execution_state(struct vcpu_execution_state *x_state)
{
        struct per_cpu *cpu_data = this_cpu_data();

        x_state->efer = cpu_data->vmcb.efer;
        x_state->rflags = cpu_data->vmcb.rflags;
        x_state->cs = cpu_data->vmcb.cs.selector;
        x_state->rip = cpu_data->vmcb.rip;
}