// Assembly entry stub (defined elsewhere as "resume_vm") that performs the
// VMRUN/VMEXIT round trip; regparm(3) passes phys_vmcb and regs in registers.
9 static void resume_vm(Mword phys_vmcb, Mword *regs)
10 asm("resume_vm") __attribute__((__regparm__(3)));
// Per-CPU ASID bookkeeping for this Vm: the ASID currently assigned on each
// CPU and the generation in which it was handed out (see configure_asid()).
11 Unsigned8 _asid[Config::Max_num_cpus];
12 Unsigned32 _asid_generation[Config::Max_num_cpus];
// Opcode of the vm_run operation, first op after the Task protocol ops.
25 Vm_run = Task::Vm_ops + 0,
29 // ------------------------------------------------------------------------
30 INTERFACE [svm && debug]:
// Trace-buffer payload recorded on every SVM VM exit (filled in sys_vm_run():
// exit code/infos from the VMCB control area plus the guest RIP).
35 struct Log_vm_svm_exit
37 Mword exitcode, exitinfo1, exitinfo2, rip;
// Formatter for the trace entry; bound to the symbol name referenced by
// LOG_TRACE ("__fmt_vm_svm_exit").
40 static unsigned log_fmt(Tb_entry *, int max, char *buf) asm ("__fmt_vm_svm_exit");
43 // ------------------------------------------------------------------------
47 #include "mem_space.h"
51 #include "thread.h" // XXX: circular dep, move this out here!
52 #include "thread_state.h" // XXX: circular dep, move this out here!
56 // ------------------------------------------------------------------------
57 IMPLEMENTATION [svm && ia32]:
66 Vm::get_vm_cr3(Vmcb *)
68 // When running in 32bit mode we already return the page-table of our Vm
69 // object, whether we're running with shadow or nested paging
70 return mem_space()->phys_dir();
73 //----------------------------------------------------------------------------
74 IMPLEMENTATION [svm && amd64]:
// AMD64 variant: pick a guest page-table root depending on nested vs.
// shadow paging and on the guest's long-mode state (EFER.LME + CR0.PG).
83 Vm::get_vm_cr3(Vmcb *v)
85 // When we have nested paging, we just return the 4lvl host page-table of
88 return mem_space()->phys_dir();
90 // When running with shadow paging and the guest is running in long mode
91 // and has paging enabled, we can just return the 4lvl page table of our
93 if ( (v->state_save_area.efer & EFER_LME)
94 && (v->state_save_area.cr0 & CR0_PG))
95 return mem_space()->phys_dir();
97 // Now it's getting tricky when running with shadow paging.
98 // We need to obey the following rules:
99 // - When the guest is not running in 64bit mode the CR3 one can set for
100 // the page-table must be below 4G physical memory (i.e. bit 32-63 must
101 // be zero). This is unfortunate when the host has memory above 4G as
102 // Fiasco gets its memory from the end of physical memory, i.e.
103 // page-table memory is above 4G.
104 // - We need an appropriate page-table format for 32bit!
105 // That means either a 2lvl page-table or a 3lvl PAE one. That would
106 // require to maintain two page-tables for the guest, one for 32bit
107 // mode execution and one for 64 bit execution. It is needed either for
108 // the transition from real to long-mode via protected mode or for
109 // 32bit only guests.
110 // There's one trick to avoid having two PTs: 4lvl-PTs and 3lvl-PAE-PTs
111 // have much in common so that it's possible to just take the PDPE
112 // one of the host as the 3lvl-PAE-PT for the guest. Well, not quite.
113 // The problem is that SVM checks that MBZ bits in the PAE-PT entries
114 // are really 0 as written in the spec. Now the 4lvl PT contains rights
115 // bits there, so that this type of PT is refused and does not work on
117 // So why is the code still here? Well, QEmu isn't so picky about the
118 // bits in the PDPE and it thus works there...
// Use the address of the top-level entry at virtual 0 as the 3lvl root.
119 Address vm_cr3 = mem_space()->dir()->walk(Virt_addr(0), 0).e->addr();
120 if (EXPECT_FALSE(!vm_cr3))
122 // force allocation of new secondary page-table level
123 mem_space()->dir()->alloc_cast<Mem_space_q_alloc>()
124 ->walk(Virt_addr(0), 1, Mem_space_q_alloc(ram_quota(),
125 Mapped_allocator::allocator()));
126 vm_cr3 = mem_space()->dir()->walk(Virt_addr(0), 0).e->addr();
// Non-long-mode guests require a sub-4G CR3 (see rules above); fail loudly.
129 if (EXPECT_FALSE(vm_cr3 >= 1UL << 32))
131 WARN("svm: Host page-table not under 4G, sorry.\n");
138 //----------------------------------------------------------------------------
139 IMPLEMENTATION [svm]:
// Memory space used for VM guest-physical memory. Constructed with the
// second Mem_space ctor argument 'false' (semantics of that flag are not
// visible in this chunk — presumably "do not share the kernel mapping";
// verify against Mem_space).
141 class Mem_space_vm : public Mem_space
144 Mem_space_vm(Ram_quota *q) : Mem_space(q, false) {}
// Allow mappings across the entire virtual range (MWORD_BITS worth of
// pages), unlike regular task spaces.
145 virtual Page_number map_max_address() const
146 { return Page_number::create(1UL << (MWORD_BITS - Page_shift)); }
// Factory handed to Task's ctor so that the task's Mem_space slot is
// placement-constructed as a Mem_space_vm.
149 struct Vm_space_factory
151 /** Create a usual Mem_space object. */
152 template< typename A1 >
153 static void create(Mem_space *v, A1 a1)
154 { new (v) Mem_space_vm(a1); }
// Generic create() overload for the other space types (default construct).
156 template< typename S >
157 static void create(S *v)
// Slab cache from which Vm objects are allocated (quota-tracked).
165 static slab_cache_anon* slabs = new Kmem_slab_simple (sizeof (Vm),
// Placement new into slab-provided storage; size must match exactly.
173 Vm::operator new (size_t size, void *p)
176 assert (size == sizeof (Vm));
// ASID accessors: each CPU caches the ASID and the generation it was
// allocated in; both are validated in configure_asid().
184 return _asid[current_cpu()];
189 Vm::asid (Unsigned8 asid)
191 _asid[current_cpu()] = asid;
196 Vm::asid_generation ()
198 return _asid_generation[current_cpu()];
203 Vm::asid_generation (Unsigned32 generation)
205 _asid_generation[current_cpu()] = generation;
// Vm ctor: a Task built with the VM space factory and an empty fpage;
// start with no valid ASID on any CPU.
210 : Task(Vm_space_factory(), q, L4_fpage(0))
212 memset(_asid, 0, sizeof(_asid));
213 memset(_asid_generation, 0, sizeof(_asid_generation));
// Allocate a Vm from the slab against the given quota and construct it
// in place. NOTE(review): error path for a failed q_alloc is outside
// this view.
219 Vm::create(Ram_quota *quota)
221 if (void *t = allocator()->q_alloc(quota))
223 Vm *a = new (t) Vm(quota);
234 // - force fpu ownership
235 // - debug registers not covered by VMCB
// Copy the guest-visible state-save area field by field from src to dest.
// Used in both directions: user VMCB -> kernel VMCB before VMRUN, and back
// after VMEXIT. NOTE(review): additional fields (rip/rsp/cr*/efer, ...) are
// copied on lines not visible in this chunk.
239 Vm::copy_state_save_area(Vmcb *dest, Vmcb *src)
241 Vmcb_state_save_area *d = &dest->state_save_area;
242 Vmcb_state_save_area *s = &src->state_save_area;
// Segment registers: selector, attributes, limit and base for each.
244 d->es_sel = s->es_sel;
245 d->es_attrib = s->es_attrib;
246 d->es_limit = s->es_limit;
247 d->es_base = s->es_base;
249 d->cs_sel = s->cs_sel;
250 d->cs_attrib = s->cs_attrib;
251 d->cs_limit = s->cs_limit;
252 d->cs_base = s->cs_base;
254 d->ss_sel = s->ss_sel;
255 d->ss_attrib = s->ss_attrib;
256 d->ss_limit = s->ss_limit;
257 d->ss_base = s->ss_base;
259 d->ds_sel = s->ds_sel;
260 d->ds_attrib = s->ds_attrib;
261 d->ds_limit = s->ds_limit;
262 d->ds_base = s->ds_base;
264 d->fs_sel = s->fs_sel;
265 d->fs_attrib = s->fs_attrib;
266 d->fs_limit = s->fs_limit;
267 d->fs_base = s->fs_base;
269 d->gs_sel = s->gs_sel;
270 d->gs_attrib = s->gs_attrib;
271 d->gs_limit = s->gs_limit;
272 d->gs_base = s->gs_base;
// Descriptor tables and task register.
274 d->gdtr_sel = s->gdtr_sel;
275 d->gdtr_attrib = s->gdtr_attrib;
276 d->gdtr_limit = s->gdtr_limit;
277 d->gdtr_base = s->gdtr_base;
279 d->ldtr_sel = s->ldtr_sel;
280 d->ldtr_attrib = s->ldtr_attrib;
281 d->ldtr_limit = s->ldtr_limit;
282 d->ldtr_base = s->ldtr_base;
284 d->idtr_sel = s->idtr_sel;
285 d->idtr_attrib = s->idtr_attrib;
286 d->idtr_limit = s->idtr_limit;
287 d->idtr_base = s->idtr_base;
289 d->tr_sel = s->tr_sel;
290 d->tr_attrib = s->tr_attrib;
291 d->tr_limit = s->tr_limit;
292 d->tr_base = s->tr_base;
302 d->rflags = s->rflags;
// MSR-backed state: SYSCALL/SYSENTER and kernel GS base.
311 d->sfmask = s->sfmask;
312 d->kernelgsbase = s->kernelgsbase;
313 d->sysenter_cs = s->sysenter_cs;
314 d->sysenter_esp = s->sysenter_esp;
315 d->sysenter_eip = s->sysenter_eip;
// Debug/branch-trace related state.
319 d->dbgctl = s->dbgctl;
320 d->br_from = s->br_from;
322 d->lastexcpfrom = s->lastexcpfrom;
323 d->last_excpto = s->last_excpto;
// Copy the guest-controllable parts of the VMCB control area from the
// user-provided VMCB into the kernel VMCB. Security-sensitive fields
// (iopm_base_pa, msrpm_base_pa, n_cr3) are deliberately NOT copied here;
// sys_vm_run() overwrites them with trusted values.
329 Vm::copy_control_area(Vmcb *dest, Vmcb *src)
331 Vmcb_control_area *d = &dest->control_area;
332 Vmcb_control_area *s = &src->control_area;
// Intercept bitmaps (sys_vm_run() later ORs in mandatory intercepts).
334 d->intercept_rd_crX = s->intercept_rd_crX;
335 d->intercept_wr_crX = s->intercept_wr_crX;
337 d->intercept_rd_drX = s->intercept_rd_drX;
338 d->intercept_wr_drX = s->intercept_wr_drX;
340 d->intercept_exceptions = s->intercept_exceptions;
342 d->intercept_instruction0 = s->intercept_instruction0;
343 d->intercept_instruction1 = s->intercept_instruction1;
345 // skip iopm_base_pa and msrpm_base_pa
347 d->tsc_offset = s->tsc_offset;
348 d->guest_asid_tlb_ctl = s->guest_asid_tlb_ctl;
349 d->interrupt_ctl = s->interrupt_ctl;
350 d->interrupt_shadow = s->interrupt_shadow;
351 d->exitcode = s->exitcode;
352 d->exitinfo1 = s->exitinfo1;
353 d->exitinfo2 = s->exitinfo2;
354 d->exitintinfo = s->exitintinfo;
355 d->np_enable = s->np_enable;
357 d->eventinj = s->eventinj;
359 d->lbr_virtualization_enable = s->lbr_virtualization_enable;
363 /* skip anything that does not change */
// Copy back only the control-area fields the hardware updates on VMEXIT
// (exit status, pending event/interrupt state) into the user VMCB.
366 Vm::copy_control_area_back(Vmcb *dest, Vmcb *src)
368 Vmcb_control_area *d = &dest->control_area;
369 Vmcb_control_area *s = &src->control_area;
371 d->interrupt_ctl = s->interrupt_ctl;
372 d->interrupt_shadow = s->interrupt_shadow;
// Why the guest exited and the associated qualification data.
374 d->exitcode = s->exitcode;
375 d->exitinfo1 = s->exitinfo1;
376 d->exitinfo2 = s->exitinfo2;
377 d->exitintinfo = s->exitintinfo;
379 d->eventinj = s->eventinj;
382 /** \brief Choose an ASID for this Vm.
384 * Choose an ASID for this Vm. The ASID provided by userspace is ignored
385 * instead the kernel picks one.
386 * Userspace uses the flush-bit to receive a new ASID for this Vm.
387 * All ASIDs are flushed as soon as the kernel runs out of ASIDs.
389 * @param vmcb_s external VMCB provided by userspace
390 * @param kernel_vmcb_s our VMCB
395 Vm::configure_asid (Vmcb *vmcb_s, Vmcb *kernel_vmcb_s)
// Must run with interrupts disabled: ASID state is per-CPU.
397 assert (cpu_lock.test())
399 Svm &s = Svm::cpus.cpu(current_cpu());
// Re-allocate our ASID if userspace requested a TLB flush (bit 32 of
// guest_asid_tlb_ctl) or if our cached ASID is stale for this CPU.
401 if (// vmm requests flush
402 ((vmcb_s->control_area.guest_asid_tlb_ctl >> 32) & 1) == 1 ||
403 // our asid is not valid or expired
404 !(s.asid_valid(asid(), asid_generation())))
407 asid_generation(s.global_asid_generation());
410 assert(s.asid_valid(asid(), asid_generation()));
// Program the chosen ASID; set the flush bit (bit 32) if a global
// ASID flush is pending on this CPU.
412 kernel_vmcb_s->control_area.guest_asid_tlb_ctl = asid();
413 if (s.flush_all_asids())
415 kernel_vmcb_s->control_area.guest_asid_tlb_ctl |= (1ULL << 32);
416 s.flush_all_asids(false);
// Fallback path: ASID 1 with flush requested. NOTE(review): the branch
// structure around this fallback is on lines not visible in this chunk.
419 kernel_vmcb_s->control_area.guest_asid_tlb_ctl = 1;
420 kernel_vmcb_s->control_area.guest_asid_tlb_ctl |= (1ULL << 32);
// Execute one VMRUN round trip for this Vm.
//
// Validates the user-supplied VMCB (delivered as a flexpage in the message),
// sanitizes it into the trusted per-CPU kernel VMCB, saves host state not
// covered by VMRUN/VMEXIT, runs the guest via resume_vm(), then restores
// host state and copies exit information back to the user VMCB.
// Returns 0 on success or a negative L4 error code on validation failure.
425 Vm::sys_vm_run(Syscall_frame *f, Utcb *utcb)
428 Unsigned64 orig_cr3, orig_ncr3;
430 assert (cpu_lock.test());
432 /* these 4 must not use ldt entries */
// Bit 2 of a selector is the table-indicator (LDT) bit.
433 assert (!(Cpu::get_cs() & (1 << 2)));
434 assert (!(Cpu::get_ss() & (1 << 2)));
435 assert (!(Cpu::get_ds() & (1 << 2)));
436 assert (!(Cpu::get_es() & (1 << 2)));
438 Svm &s = Svm::cpus.cpu(current_cpu());
440 L4_msg_tag const &tag = f->tag();
442 if (EXPECT_FALSE(!s.svm_enabled()))
444 WARN("svm: not supported/enabled\n");
445 return commit_result(-L4_err::EInval);
// The message must carry the GP-register block plus at least one word.
448 if (EXPECT_FALSE(tag.words() < 1 + Svm::Gpregs_words))
450 WARN("svm: Invalid message length\n");
451 return commit_result(-L4_err::EInval);
454 L4_snd_item_iter vmcb_item(utcb, tag.words());
456 if (EXPECT_FALSE(!tag.items() || !vmcb_item.next()))
457 return commit_result(-L4_err::EInval);
459 L4_fpage vmcb_fpage(vmcb_item.get()->d);
461 if (EXPECT_FALSE(!vmcb_fpage.is_mempage()))
463 WARN("svm: Fpage invalid\n");
464 return commit_error(utcb, L4_error::Overflow);
// The VMCB flexpage must cover at least one 4K page.
467 if (EXPECT_FALSE(vmcb_fpage.order() < 12))
468 return commit_result(-L4_err::EInval);
470 Vmcb *vmcb_s = (Vmcb *)(Virt_addr(vmcb_fpage.mem_address()).value());
471 Vmcb *kernel_vmcb_s = s.kernel_vmcb();
473 if (EXPECT_FALSE(vmcb_s->np_enabled() && !s.has_npt()))
475 WARN("svm: No NPT available\n");
476 return commit_result(-L4_err::EInval);
479 Address vm_cr3 = get_vm_cr3(vmcb_s);
480 // can only fail on 64bit, will be optimized away on 32bit
481 if (EXPECT_FALSE(is_64bit() && !vm_cr3))
482 return commit_result(-L4_err::ENomem);
// Verify the user VMCB is actually mapped in the caller's space.
484 Mem_space::Phys_addr phys_vmcb;
485 Mem_space::Size size;
487 unsigned int page_attribs;
489 Mem_space *const curr_mem_space = current()->space()->mem_space();
490 resident = curr_mem_space->v_lookup(Virt_addr(vmcb_s), &phys_vmcb, &size, &page_attribs);
494 WARN("svm: VMCB invalid\n");
495 return commit_result(-L4_err::EInval);
498 // currently only support for nested pagetables
499 // if shadow page tables are to be allowed then cr0
500 // needs further scrutiny and cr3 must not be accessible
501 if((vmcb_s->control_area.np_enable & 1) != 1)
502 return commit_result(-L4_err::EInval);
505 // neither EFER.LME nor EFER.LMA must be set
506 if (EXPECT_FALSE(!is_64bit()
507 && (vmcb_s->state_save_area.efer & (EFER_LME | EFER_LMA))))
509 WARN("svm: EFER invalid %llx\n", vmcb_s->state_save_area.efer);
510 return commit_result(-L4_err::EInval);
513 // EFER.SVME must be set
// 0x1000 == EFER bit 12 (SVME); VMRUN requires it set in the guest EFER.
514 if (!(vmcb_s->state_save_area.efer & 0x1000))
516 WARN("svm: EFER invalid %llx\n", vmcb_s->state_save_area.efer);
517 return commit_result(-L4_err::EInval);
519 // allow PAE in combination with NPT
521 // CR4.PAE must be clear
// 0x20 == CR4 bit 5 (PAE).
522 if(vmcb_s->state_save_area.cr4 & 0x20)
523 return commit_result(-L4_err::EInval);
527 // This generates a circular dep between thread<->task, this cries for a
528 // new abstraction...
// Make the current thread own the FPU before entering the guest.
529 if (!(current()->state() & Thread_fpu_owner))
531 if (!current_thread()->switchin_fpu())
533 WARN("svm: switchin_fpu failed\n");
534 return commit_result(-L4_err::EInval);
538 #if 0 //should never happen
539 host_cr0 = Cpu::get_cr0();
540 // the VMM does not currently own the fpu but wants to
541 // make it available for the guest. This may happen
542 // if it was descheduled between activating the fpu and
543 // executing the vm_run operation
544 if (!(vmcb_s->state_save_area.cr0 & 0x8) && (host_cr0 & 0x8))
546 WARN("svm: FPU TS\n");
547 return commit_result(-L4_err::EInval);
551 // increment our refcount, and drop it at the end automatically
552 Ref_ptr<Vm> pin_myself(this);
// Remember the caller's CR3/nCR3 so they can be restored in the user
// VMCB after the run (the kernel overwrites them with trusted values).
556 orig_cr3 = vmcb_s->state_save_area.cr3;
557 orig_ncr3 = vmcb_s->control_area.n_cr3;
559 copy_control_area(kernel_vmcb_s, vmcb_s);
560 copy_state_save_area(kernel_vmcb_s, vmcb_s);
562 if (EXPECT_FALSE(is_64bit() && !kernel_vmcb_s->np_enabled()
563 && (kernel_vmcb_s->state_save_area.cr0 & CR0_PG)
564 && !(kernel_vmcb_s->state_save_area.cr4 & CR4_PAE)))
566 WARN("svm: No 32bit shadow page-tables on AMD64, use PAE!\n");
567 return commit_result(-L4_err::EInval);
570 // set MCE according to host
571 kernel_vmcb_s->state_save_area.cr4 |= Cpu::get_cr4() & CR4_MCE;
573 // allow w access to cr0, cr2, cr3
574 // allow r access to cr0, cr2, cr3, cr4
575 // to do: check if enabling PAE in cr4 needs to be controlled
577 // allow r/w access to dr[0-7]
// Force-intercept DR accesses regardless of what userspace configured.
578 kernel_vmcb_s->control_area.intercept_rd_drX |= 0xff00;
579 kernel_vmcb_s->control_area.intercept_wr_drX |= 0xff00;
582 // intercept exception vectors 0-31
583 kernel_vmcb_s->control_area.intercept_exceptions = 0xffffffff;
586 // enable iopm and msrpm
587 kernel_vmcb_s->control_area.intercept_instruction0 |= 0x18000000;
588 // intercept FERR_FREEZE and shutdown events
589 kernel_vmcb_s->control_area.intercept_instruction0 |= 0xc0000000;
590 // intercept INTR/NMI/SMI/INIT
591 kernel_vmcb_s->control_area.intercept_instruction0 |= 0xf;
593 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 22);
595 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 24);
596 // intercept task switch
597 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 29);
598 // intercept shutdown
599 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 31);
600 // intercept MONITOR/MWAIT
601 kernel_vmcb_s->control_area.intercept_instruction1 |= (1 << 10) | (1 << 11);
603 // intercept virtualization related instructions
604 // vmrun interception is required by the hardware
605 kernel_vmcb_s->control_area.intercept_instruction1 |= 0xff;
// Overwrite the permission-bitmap pointers with the kernel's trusted
// copies — never trust the addresses from the user VMCB.
607 Mword kernel_vmcb_pa = s.kernel_vmcb_pa();
608 Unsigned64 iopm_base_pa = s.iopm_base_pa();
609 Unsigned64 msrpm_base_pa = s.msrpm_base_pa();
611 kernel_vmcb_s->control_area.iopm_base_pa = iopm_base_pa;
612 kernel_vmcb_s->control_area.msrpm_base_pa = msrpm_base_pa;
614 configure_asid(vmcb_s, kernel_vmcb_s);
616 // 7:0 V_TPR, 8 V_IRQ, 15:9 reserved SBZ,
617 // 19:16 V_INTR_PRIO, 20 V_IGN_TPR, 23:21 reserved SBZ
618 // 24 V_INTR_MASKING 31:25 reserved SBZ
619 // 39:32 V_INTR_VECTOR, 63:40 reserved SBZ
621 kernel_vmcb_s->control_area.interrupt_ctl = 0x10f0000;
623 // enable IRQ masking virtualization
624 kernel_vmcb_s->control_area.interrupt_ctl |= 0x01000000;
627 // 0 INTERRUPT_SHADOW, 31:1 reserved SBZ
628 // 63:32 reserved SBZ
629 kernel_vmcb_s->control_area.interrupt_shadow = 0;
// Clear stale exit state before entering the guest.
632 kernel_vmcb_s->control_area.exitcode = 0;
633 kernel_vmcb_s->control_area.exitinfo1 = 0;
634 kernel_vmcb_s->control_area.exitinfo2 = 0;
635 kernel_vmcb_s->control_area.exitintinfo = 0;
638 // 0/1 NP_ENABLE, 31:1 reserved SBZ
639 kernel_vmcb_s->control_area.np_enable = 1;
641 // 31 VALID, EVENTINJ
642 kernel_vmcb_s->control_area.eventinj = 0;
// Use the kernel-computed page-table root, not the caller's.
646 kernel_vmcb_s->control_area.n_cr3 = vm_cr3;
648 if (!kernel_vmcb_s->np_enabled())
650 // to do: check that the vmtask has the
651 // VM property set, i.e. does not contain mappings
652 // to the fiasco kernel regions or runs with PL 3
654 // printf("nested paging disabled, use n_cr3 as cr3\n");
655 kernel_vmcb_s->state_save_area.cr3 = vm_cr3;
657 // intercept accesses to cr0, cr3 and cr4
658 kernel_vmcb_s->control_area.intercept_rd_crX = 0xfff9;
659 kernel_vmcb_s->control_area.intercept_wr_crX = 0xfff9;
663 kernel_vmcb_s->control_area.lbr_virtualization_enable = 0;
668 // - initialize VM_HSAVE_PA (done)
669 // - supply trusted msrpm_base_pa and iopm_base_pa (done)
670 // - save host state not covered by VMRUN/VMEXIT (ldt, some segments etc) (done)
671 // - disable interrupts (done)
672 // - trigger intercepted device and timer interrupts (done, not necessary)
673 // - check host CR0.TS (floating point registers) (done)
// Save host MSRs/LDT/TR that VMRUN does not preserve automatically.
675 Unsigned64 sysenter_cs, sysenter_eip, sysenter_esp;
680 sysenter_cs = Cpu::rdmsr(MSR_SYSENTER_CS);
681 sysenter_eip = Cpu::rdmsr(MSR_SYSENTER_EIP);
682 sysenter_esp = Cpu::rdmsr(MSR_SYSENTER_ESP);
687 ldtr = Cpu::get_ldt();
691 tr_entry = (*Cpu::cpus.cpu(current_cpu()).get_gdt())[tr / 8];
694 // to do: check if the nested page table walker looks
695 // into the TLB. if so, global pages have to be disabled in
697 cr4 = Cpu::get_cr4();
700 // disable support for global pages as the vm task has
701 // a divergent upper memory region from the regular tasks
702 Cpu::set_cr4(cr4 & ~CR4_PGE);
// Enter the guest; returns here on VMEXIT. Guest GP registers live in
// the UTCB starting at values[1].
705 resume_vm(kernel_vmcb_pa, &utcb->values[1]);
// Restore host state clobbered by the guest run.
713 Cpu::wrmsr(sysenter_cs, MSR_SYSENTER_CS);
714 Cpu::wrmsr(sysenter_eip, MSR_SYSENTER_EIP);
715 Cpu::wrmsr(sysenter_esp, MSR_SYSENTER_ESP);
// Clear the busy bit (bit 1 of the access byte) in the TSS descriptor so
// TR can be reloaded without faulting.
724 tss_entry = (*Cpu::cpus.cpu(current_cpu()).get_gdt())[tr / 8];
725 tss_entry.access &= 0xfd;
726 (*Cpu::cpus.cpu(current_cpu()).get_gdt())[tr / 8] = tss_entry;
728 Cpu::set_tr(tr); // TODO move under stgi in asm
// Propagate exit state back to the user VMCB, but restore the caller's
// original CR3/nCR3 values (the kernel-substituted ones must not leak).
730 copy_state_save_area(vmcb_s, kernel_vmcb_s);
731 copy_control_area_back(vmcb_s, kernel_vmcb_s);
733 if (!(vmcb_s->np_enabled()))
734 vmcb_s->state_save_area.cr3 = orig_cr3;
736 vmcb_s->control_area.n_cr3 = orig_ncr3;
738 LOG_TRACE("VM-SVM", "svm", current(), __fmt_vm_svm_exit,
739 Log_vm_svm_exit *l = tbe->payload<Log_vm_svm_exit>();
740 l->exitcode = vmcb_s->control_area.exitcode;
741 l->exitinfo1 = vmcb_s->control_area.exitinfo1;
742 l->exitinfo2 = vmcb_s->control_area.exitinfo2;
743 l->rip = vmcb_s->state_save_area.rip;
746 return commit_result(L4_error::None);
// Kernel-object dispatch for Vm: handle the Vm_run opcode here, reject
// wrong protocols, and forward everything else to the Task dispatcher.
751 Vm::invoke(L4_obj_ref obj, Mword rights, Syscall_frame *f, Utcb *utcb)
753 if (EXPECT_FALSE(f->tag().proto() != L4_msg_tag::Label_task))
755 f->tag(commit_result(-L4_err::EBadproto));
// First UTCB word selects the operation (see Vm_run above).
759 switch (utcb->values[0])
762 f->tag(sys_vm_run(f, utcb));
765 Task::invoke(obj, rights, f, utcb);
770 // ------------------------------------------------------------------------
771 IMPLEMENTATION [svm && debug]:
// Render a Log_vm_svm_exit trace entry into buf (at most max bytes);
// returns the snprintf() result.
775 Vm::log_fmt(Tb_entry *e, int max, char *buf)
777 Log_vm_svm_exit *l = e->payload<Log_vm_svm_exit>();
778 return snprintf(buf, max, "ec=%lx ei1=%08lx ei2=%08lx rip=%08lx",
779 l->exitcode, l->exitinfo1, l->exitinfo2, l->rip);