// Vm_svm: kernel object implementing a virtual machine on AMD SVM
// (AMD-V) hardware; extends the generic Vm class.
8 class Vm_svm : public Vm
// Assembly entry point performing the actual world switch into the guest.
// Takes the physical address of the VMCB and the vCPU register state.
// regparm(3): arguments are passed in registers (ia32 calling convention).
11 static void resume_vm_svm(Mword phys_vmcb, Vcpu_state *regs)
12 asm("resume_vm_svm") __attribute__((__regparm__(3)));
// Per-CPU ASID bookkeeping: the ASID assigned to this VM on each CPU,
// plus the generation in which it was allocated (see configure_asid()).
14 typedef Per_cpu_array<Unsigned8> Asid_array;
15 typedef Per_cpu_array<Unsigned32> Asid_version_array;
18 Asid_version_array _asid_generation;
27 // ------------------------------------------------------------------------
28 INTERFACE [svm && debug]:
32 EXTENSION class Vm_svm
// Trace-buffer entry capturing the relevant VMCB fields at VM exit
// (written by the LOG_TRACE in do_resume_vcpu, printed by print()).
35 struct Log_vm_svm_exit : public Tb_entry
37 Mword exitcode, exitinfo1, exitinfo2, rip;
38 unsigned print(int max, char *buf) const;
43 // ------------------------------------------------------------------------
47 #include "mem_space.h"
51 #include "thread.h" // XXX: circular dependency, move this include out of here!
52 #include "thread_state.h" // XXX: circular dependency, move this include out of here!
55 // ------------------------------------------------------------------------
56 IMPLEMENTATION [svm && ia32]:
// Return the CR3 value the guest should run with. On ia32 this is
// simply the page-table of this Vm object, independent of whether
// shadow or nested paging is in use (see comment below).
60 PRIVATE inline NEEDS["virt.h"]
62 Vm_svm::get_vm_cr3(Vmcb *)
64 // When running in 32bit mode we already return the page-table of our Vm
65 // object, whether we're running with shadow or nested paging
69 //----------------------------------------------------------------------------
70 IMPLEMENTATION [svm && amd64]:
72 #include "assert_opt.h"
// Return the CR3 value the guest should run with on amd64, or 0 on
// failure (caller maps 0 to -ENomem). Depending on NPT and the guest's
// EFER/CR0 state, different page-table roots are applicable; see the
// long discussion in the comments below.
75 PRIVATE inline NEEDS["assert_opt.h", "virt.h"]
77 Vm_svm::get_vm_cr3(Vmcb *v)
79 // When we have nested paging, we just return the 4lvl host page-table of
84 // When running with shadow paging and the guest is running in long mode
85 // and has paging enabled, we can just return the 4lvl page table of our
// Guest is in long mode with paging on: the 4lvl table is usable as-is.
87 if ( (v->state_save_area.efer & EFER_LME)
88 && (v->state_save_area.cr0 & CR0_PG))
91 // Now it's getting tricky when running with shadow paging.
92 // We need to obey the following rules:
93 // - When the guest is not running in 64bit mode the CR3 one can set for
94 // the page-table must be below 4G physical memory (i.e. bit 32-63 must
95 // be zero). This is unfortunate when the host has memory above 4G as
96 // Fiasco gets its memory from the end of physical memory, i.e.
97 // page-table memory is above 4G.
98 // - We need an appropriate page-table format for 32bit!
99 // That means either a 2lvl page-table or a 3lvl PAE one. That would
100 // require to maintain two page-tables for the guest, one for 32bit
101 // mode execution and one for 64 bit execution. It is needed either for
102 // the transition from real to long-mode via protected mode or for
103 // 32bit only guests.
104 // There's one trick to avoid having two PTs: 4lvl-PTs and 3lvl-PAE-PTs
105 // have much in common so that it's possible to just take the PDPE
106 // one of the host as the 3lvl-PAE-PT for the guest. Well, not quite.
107 // The problem is that SVM checks that MBZ bits in the PAE-PT entries
108 // are really 0 as written in the spec. Now the 4lvl PT contains rights
109 // bits there, so that this type of PT is refused and does not work on
111 // So why is the code still here? Well, QEmu isn't so picky about the
112 // bits in the PDPE and it thus works there...
// Physical address of the second-level table below virtual address 0.
114 Address vm_cr3 = static_cast<Mem_space*>(this)->dir()->walk(Virt_addr(0), 0).next_level();
115 if (EXPECT_FALSE(!vm_cr3))
117 // force allocation of new secondary page-table level
118 static_cast<Mem_space*>(this)->dir()
119 ->walk(Virt_addr(0), 1, false, Kmem_alloc::q_allocator(ram_quota()));
120 vm_cr3 = static_cast<Mem_space*>(this)->dir()->walk(Virt_addr(0), 0).next_level();
// See rule above: non-64bit guests need the table below 4G physical.
123 if (EXPECT_FALSE(vm_cr3 >= 1UL << 32))
125 WARN("svm: Host page-table not under 4G, sorry.\n");
132 //----------------------------------------------------------------------------
133 IMPLEMENTATION [svm]:
// Read this VM's ASID for the current CPU.
139 return _asid[current_cpu()];
// Record this VM's ASID for the current CPU.
144 Vm_svm::asid(Unsigned8 asid)
146 _asid[current_cpu()] = asid;
// Read the ASID generation stored for the current CPU.
151 Vm_svm::asid_generation()
153 return _asid_generation[current_cpu()];
// Record the ASID generation for the current CPU.
158 Vm_svm::asid_generation(Unsigned32 generation)
160 _asid_generation[current_cpu()] = generation;
// Construct a Vm_svm. Both per-CPU ASID arrays are zeroed, so the VM
// starts without a valid ASID on any CPU; configure_asid() will assign
// one lazily on first resume.
164 Vm_svm::Vm_svm(Ram_quota *q)
167 memset(&_asid, 0, sizeof(_asid));
168 memset(&_asid_generation, 0, sizeof(_asid_generation));
// Placement new: storage 'p' is pre-allocated by the caller; only
// sanity-checks that the requested size matches Vm_svm.
173 Vm_svm::operator new (size_t size, void *p) throw()
176 assert (size == sizeof (Vm_svm));
182 Vm_svm::operator delete (void *ptr)
// Return the object's memory to the quota-aware slab allocator,
// crediting the RAM quota the object was charged against.
184 Vm_svm *t = reinterpret_cast<Vm_svm*>(ptr);
185 allocator<Vm_svm>()->q_free(t->ram_quota(), ptr);
191 // - force fpu ownership
192 // - debug registers not covered by VMCB
// Field-by-field copy of the VMCB state-save area from 'src' to 'dest'.
// Used in both directions: user VMCB -> kernel VMCB before VMRUN and
// kernel VMCB -> user VMCB after #VMEXIT (see do_resume_vcpu()).
196 Vm_svm::copy_state_save_area(Vmcb *dest, Vmcb *src)
198 Vmcb_state_save_area *d = &dest->state_save_area;
199 Vmcb_state_save_area *s = &src->state_save_area;
// Segment registers: selector, attributes, limit, base each.
201 d->es_sel = s->es_sel;
202 d->es_attrib = s->es_attrib;
203 d->es_limit = s->es_limit;
204 d->es_base = s->es_base;
206 d->cs_sel = s->cs_sel;
207 d->cs_attrib = s->cs_attrib;
208 d->cs_limit = s->cs_limit;
209 d->cs_base = s->cs_base;
211 d->ss_sel = s->ss_sel;
212 d->ss_attrib = s->ss_attrib;
213 d->ss_limit = s->ss_limit;
214 d->ss_base = s->ss_base;
216 d->ds_sel = s->ds_sel;
217 d->ds_attrib = s->ds_attrib;
218 d->ds_limit = s->ds_limit;
219 d->ds_base = s->ds_base;
221 d->fs_sel = s->fs_sel;
222 d->fs_attrib = s->fs_attrib;
223 d->fs_limit = s->fs_limit;
224 d->fs_base = s->fs_base;
226 d->gs_sel = s->gs_sel;
227 d->gs_attrib = s->gs_attrib;
228 d->gs_limit = s->gs_limit;
229 d->gs_base = s->gs_base;
// Descriptor tables (GDTR/LDTR/IDTR) and the task register.
231 d->gdtr_sel = s->gdtr_sel;
232 d->gdtr_attrib = s->gdtr_attrib;
233 d->gdtr_limit = s->gdtr_limit;
234 d->gdtr_base = s->gdtr_base;
236 d->ldtr_sel = s->ldtr_sel;
237 d->ldtr_attrib = s->ldtr_attrib;
238 d->ldtr_limit = s->ldtr_limit;
239 d->ldtr_base = s->ldtr_base;
241 d->idtr_sel = s->idtr_sel;
242 d->idtr_attrib = s->idtr_attrib;
243 d->idtr_limit = s->idtr_limit;
244 d->idtr_base = s->idtr_base;
246 d->tr_sel = s->tr_sel;
247 d->tr_attrib = s->tr_attrib;
248 d->tr_limit = s->tr_limit;
249 d->tr_base = s->tr_base;
259 d->rflags = s->rflags;
// MSR-backed state (syscall/sysenter machinery).
268 d->sfmask = s->sfmask;
269 d->kernelgsbase = s->kernelgsbase;
270 d->sysenter_cs = s->sysenter_cs;
271 d->sysenter_esp = s->sysenter_esp;
272 d->sysenter_eip = s->sysenter_eip;
// Debug-control and last-branch-record state.
276 d->dbgctl = s->dbgctl;
277 d->br_from = s->br_from;
279 d->lastexcpfrom = s->lastexcpfrom;
280 d->last_excpto = s->last_excpto;
// Copy the VMCB control area from the userspace VMCB into the kernel
// VMCB. Security-relevant fields are deliberately skipped (see the
// iopm/msrpm comment below); do_resume_vcpu() fills those in with
// trusted values before VMRUN.
286 Vm_svm::copy_control_area(Vmcb *dest, Vmcb *src)
288 Vmcb_control_area *d = &dest->control_area;
289 Vmcb_control_area *s = &src->control_area;
// Intercept masks for control/debug register accesses and exceptions.
291 d->intercept_rd_crX = s->intercept_rd_crX;
292 d->intercept_wr_crX = s->intercept_wr_crX;
294 d->intercept_rd_drX = s->intercept_rd_drX;
295 d->intercept_wr_drX = s->intercept_wr_drX;
297 d->intercept_exceptions = s->intercept_exceptions;
299 d->intercept_instruction0 = s->intercept_instruction0;
300 d->intercept_instruction1 = s->intercept_instruction1;
302 // skip iopm_base_pa and msrpm_base_pa
304 d->tsc_offset = s->tsc_offset;
305 d->guest_asid_tlb_ctl = s->guest_asid_tlb_ctl;
306 d->interrupt_ctl = s->interrupt_ctl;
307 d->interrupt_shadow = s->interrupt_shadow;
308 d->exitcode = s->exitcode;
309 d->exitinfo1 = s->exitinfo1;
310 d->exitinfo2 = s->exitinfo2;
311 d->exitintinfo = s->exitintinfo;
312 d->np_enable = s->np_enable;
314 d->eventinj = s->eventinj;
316 d->lbr_virtualization_enable = s->lbr_virtualization_enable;
320 /* skip anything that does not change */
// Copy back, from the kernel VMCB to the userspace VMCB, only the
// control-area fields the hardware may have modified during
// VMRUN/#VMEXIT: interrupt state, exit information, event injection.
323 Vm_svm::copy_control_area_back(Vmcb *dest, Vmcb *src)
325 Vmcb_control_area *d = &dest->control_area;
326 Vmcb_control_area *s = &src->control_area;
328 d->interrupt_ctl = s->interrupt_ctl;
329 d->interrupt_shadow = s->interrupt_shadow;
// Exit reason and qualification, consumed by the userspace VMM.
331 d->exitcode = s->exitcode;
332 d->exitinfo1 = s->exitinfo1;
333 d->exitinfo2 = s->exitinfo2;
334 d->exitintinfo = s->exitintinfo;
336 d->eventinj = s->eventinj;
339 /** \brief Choose an ASID for this Vm.
341 * Choose an ASID for this Vm. The ASID provided by userspace is ignored;
342 * instead the kernel picks one.
343 * Userspace uses the flush-bit to receive a new ASID for this Vm.
344 * All ASIDs are flushed as soon as the kernel runs out of ASIDs.
346 * @param vmcb_s external VMCB provided by userspace
347 * @param kernel_vmcb_s our VMCB
352 Vm_svm::configure_asid(Vmcb *vmcb_s, Vmcb *kernel_vmcb_s)
// Must run with the CPU lock held: ASID state is per-CPU.
354 assert (cpu_lock.test());
356 Svm &s = Svm::cpus.cpu(current_cpu());
// Allocate a fresh ASID if userspace requested a flush (bit 32 of
// guest_asid_tlb_ctl) or our cached ASID is stale for this CPU.
358 if (// vmm requests flush
359 ((vmcb_s->control_area.guest_asid_tlb_ctl >> 32) & 1) == 1 ||
360 // our asid is not valid or expired
361 !(s.asid_valid(asid(), asid_generation())))
364 asid_generation(s.global_asid_generation());
367 assert(s.asid_valid(asid(), asid_generation()));
// Program the chosen ASID into the kernel VMCB.
369 kernel_vmcb_s->control_area.guest_asid_tlb_ctl = asid();
// A global flush is pending (ASID space wrapped): request a full TLB
// flush via bit 32 once, then clear the pending flag.
370 if (s.flush_all_asids())
372 kernel_vmcb_s->control_area.guest_asid_tlb_ctl |= (1ULL << 32);
373 s.flush_all_asids(false);
// NOTE(review): fallback path (enclosing preprocessor context elided
// here) — uses fixed ASID 1 and always flushes the TLB; presumably for
// configurations without ASID management. Confirm against full source.
376 kernel_vmcb_s->control_area.guest_asid_tlb_ctl = 1;
377 kernel_vmcb_s->control_area.guest_asid_tlb_ctl |= (1ULL << 32);
381 PRIVATE inline NOEXPORT
// Validate the userspace-provided VMCB, build a sanitized kernel VMCB,
// enter the guest via resume_vm_svm(), then copy the results back into
// the userspace VMCB. Returns a negative L4_err code on validation
// failure; runs with the CPU lock held.
383 Vm_svm::do_resume_vcpu(Context *ctxt, Vcpu_state *vcpu, Vmcb *vmcb_s)
386 Unsigned64 orig_cr3, orig_ncr3;
388 assert (cpu_lock.test());
390 /* these 4 must not use ldt entries */
// Bit 2 of a selector is the table-indicator (LDT) bit.
391 assert (!(Cpu::get_cs() & (1 << 2)));
392 assert (!(Cpu::get_ss() & (1 << 2)));
393 assert (!(Cpu::get_ds() & (1 << 2)));
394 assert (!(Cpu::get_es() & (1 << 2)));
396 Svm &s = Svm::cpus.cpu(current_cpu());
398 // FIXME: this can be an assertion I think, however, think about MP
399 if (EXPECT_FALSE(!s.svm_enabled()))
401 WARN("svm: not supported/enabled\n");
402 return -L4_err::EInval;
// Userspace asked for nested paging but this CPU has no NPT support.
405 if (EXPECT_FALSE(vmcb_s->np_enabled() && !s.has_npt()))
407 WARN("svm: No NPT available\n");
408 return -L4_err::EInval;
411 Address vm_cr3 = get_vm_cr3(vmcb_s);
412 // can only fail on 64bit, will be optimized away on 32bit
413 if (EXPECT_FALSE(is_64bit() && !vm_cr3))
414 return -L4_err::ENomem;
416 // neither EFER.LME nor EFER.LMA must be set
417 if (EXPECT_FALSE(!is_64bit()
418 && (vmcb_s->state_save_area.efer & (EFER_LME | EFER_LMA))))
420 WARN("svm: EFER invalid %llx\n", vmcb_s->state_save_area.efer);
421 return -L4_err::EInval;
424 // EFER.SVME must be set
// 0x1000 == EFER.SVME (bit 12); VMRUN faults if the guest EFER clears it.
425 if (!(vmcb_s->state_save_area.efer & 0x1000))
427 WARN("svm: EFER invalid %llx\n", vmcb_s->state_save_area.efer);
428 return -L4_err::EInval;
430 // allow PAE in combination with NPT
432 // CR4.PAE must be clear
// 0x20 == CR4.PAE (bit 5).
433 if(vmcb_s->state_save_area.cr4 & 0x20)
434 return -L4_err::EInval;
438 // This generates a circular dep between thread<->task, this cries for a
439 // new abstraction...
// Make sure this context owns the FPU before entering the guest.
440 if (!(ctxt->state() & Thread_fpu_owner))
442 if (!static_cast<Thread*>(ctxt)->switchin_fpu())
444 WARN("svm: switchin_fpu failed\n");
445 return -L4_err::EInval;
449 #if 0 //should never happen
450 host_cr0 = Cpu::get_cr0();
451 // the VMM does not currently own the fpu but wants to
452 // make it available for the guest. This may happen
453 // if it was descheduled between activating the fpu and
454 // executing the vm_run operation
455 if (!(vmcb_s->state_save_area.cr0 & 0x8) && (host_cr0 & 0x8))
457 WARN("svm: FPU TS\n");
458 return commit_result(-L4_err::EInval);
462 // increment our refcount, and drop it at the end automatically
463 Ref_ptr<Vm_svm> pin_myself(this);
// Remember the user-supplied paging roots so they can be restored into
// the user VMCB after the run (the kernel overwrites them below).
467 orig_cr3 = vmcb_s->state_save_area.cr3;
468 orig_ncr3 = vmcb_s->control_area.n_cr3;
470 Vmcb *kernel_vmcb_s = s.kernel_vmcb();
// Stage the user VMCB into the trusted per-CPU kernel VMCB.
472 copy_control_area(kernel_vmcb_s, vmcb_s);
473 copy_state_save_area(kernel_vmcb_s, vmcb_s);
475 if (EXPECT_FALSE(is_64bit() && !kernel_vmcb_s->np_enabled()
476 && (kernel_vmcb_s->state_save_area.cr0 & CR0_PG)
477 && !(kernel_vmcb_s->state_save_area.cr4 & CR4_PAE)))
479 WARN("svm: No 32bit shadow page-tables on AMD64, use PAE!\n");
480 return -L4_err::EInval;
483 // set MCE according to host
484 kernel_vmcb_s->state_save_area.cr4 |= Cpu::get_cr4() & CR4_MCE;
486 // allow w access to cr0, cr2, cr3
487 // allow r access to cr0, cr2, cr3, cr4
488 // to do: check if enabling PAE in cr4 needs to be controlled
490 // allow r/w access to dr[0-7]
// Mandatory intercepts the kernel enforces regardless of userspace wishes:
491 kernel_vmcb_s->control_area.intercept_rd_drX |= 0xff00;
492 kernel_vmcb_s->control_area.intercept_wr_drX |= 0xff00;
495 // intercept exception vectors 0-31
496 kernel_vmcb_s->control_area.intercept_exceptions = 0xffffffff;
499 // enable iopm and msrpm
500 kernel_vmcb_s->control_area.intercept_instruction0 |= 0x18000000;
501 // intercept FERR_FREEZE and shutdown events
502 kernel_vmcb_s->control_area.intercept_instruction0 |= 0xc0000000;
503 // intercept INTR/NMI/SMI/INIT
504 kernel_vmcb_s->control_area.intercept_instruction0 |= 0xf;
506 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 22);
508 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 24);
509 // intercept task switch
510 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 29);
511 // intercept shutdown
512 kernel_vmcb_s->control_area.intercept_instruction0 |= (1 << 31);
513 // intercept MONITOR/MWAIT
514 kernel_vmcb_s->control_area.intercept_instruction1 |= (1 << 10) | (1 << 11);
516 // intercept virtualization related instructions
517 // vmrun interception is required by the hardware
518 kernel_vmcb_s->control_area.intercept_instruction1 |= 0xff;
// Trusted physical addresses: kernel VMCB plus the kernel-owned I/O
// and MSR permission maps (never the user-supplied ones).
520 Mword kernel_vmcb_pa = s.kernel_vmcb_pa();
521 Unsigned64 iopm_base_pa = s.iopm_base_pa();
522 Unsigned64 msrpm_base_pa = s.msrpm_base_pa();
524 kernel_vmcb_s->control_area.iopm_base_pa = iopm_base_pa;
525 kernel_vmcb_s->control_area.msrpm_base_pa = msrpm_base_pa;
527 configure_asid(vmcb_s, kernel_vmcb_s);
529 // 7:0 V_TPR, 8 V_IRQ, 15:9 reserved SBZ,
530 // 19:16 V_INTR_PRIO, 20 V_IGN_TPR, 23:21 reserved SBZ
531 // 24 V_INTR_MASKING 31:25 reserved SBZ
532 // 39:32 V_INTR_VECTOR, 63:40 reserved SBZ
534 kernel_vmcb_s->control_area.interrupt_ctl = 0x10f0000;
536 // enable IRQ masking virtualization
537 kernel_vmcb_s->control_area.interrupt_ctl |= 0x01000000;
540 // 0 INTERRUPT_SHADOW, 31:1 reserved SBZ
541 // 63:32 reserved SBZ
542 kernel_vmcb_s->control_area.interrupt_shadow = 0;
// Clear previous-run exit information before entering the guest.
545 kernel_vmcb_s->control_area.exitcode = 0;
546 kernel_vmcb_s->control_area.exitinfo1 = 0;
547 kernel_vmcb_s->control_area.exitinfo2 = 0;
548 kernel_vmcb_s->control_area.exitintinfo = 0;
551 // 0/1 NP_ENABLE, 31:1 reserved SBZ
552 kernel_vmcb_s->control_area.np_enable = 1;
554 // 31 VALID, EVENTINJ
555 kernel_vmcb_s->control_area.eventinj = 0;
// The nested/shadow page-table root is always kernel-chosen.
559 kernel_vmcb_s->control_area.n_cr3 = vm_cr3;
561 if (!kernel_vmcb_s->np_enabled())
563 // to do: check that the vmtask has the
564 // VM property set, i.e. does not contain mappings
565 // to the fiasco kernel regions or runs with PL 3
567 // printf("nested paging disabled, use n_cr3 as cr3\n");
568 kernel_vmcb_s->state_save_area.cr3 = vm_cr3;
570 // intercept accesses to cr0, cr3 and cr4
571 kernel_vmcb_s->control_area.intercept_rd_crX = 0xfff9;
572 kernel_vmcb_s->control_area.intercept_wr_crX = 0xfff9;
576 kernel_vmcb_s->control_area.lbr_virtualization_enable = 0;
581 // - initialize VM_HSAVE_PA (done)
582 // - supply trusted msrpm_base_pa and iopm_base_pa (done)
583 // - save host state not covered by VMRUN/VMEXIT (ldt, some segments etc) (done)
584 // - disable interrupts (done)
585 // - trigger intercepted device and timer interrupts (done, not necessary)
586 // - check host CR0.TS (floating point registers) (done)
// Save host state the guest may clobber (SYSENTER MSRs, LDTR, TR).
588 Unsigned64 sysenter_cs, sysenter_eip, sysenter_esp;
593 sysenter_cs = Cpu::rdmsr(MSR_SYSENTER_CS);
594 sysenter_eip = Cpu::rdmsr(MSR_SYSENTER_EIP);
595 sysenter_esp = Cpu::rdmsr(MSR_SYSENTER_ESP);
600 ldtr = Cpu::get_ldt();
604 tr_entry = (*Cpu::cpus.cpu(current_cpu()).get_gdt())[tr / 8];
607 // to do: check if the nested page table walker looks
608 // into the TLB. if so, global pages have to be disabled in
610 cr4 = Cpu::get_cr4();
613 // disable support for global pages as the vm task has
614 // a divergent upper memory region from the regular tasks
615 Cpu::set_cr4(cr4 & ~CR4_PGE);
// World switch: VMRUN; returns here after #VMEXIT.
618 resume_vm_svm(kernel_vmcb_pa, vcpu);
// Restore the host SYSENTER MSRs saved above.
626 Cpu::wrmsr(sysenter_cs, MSR_SYSENTER_CS);
627 Cpu::wrmsr(sysenter_eip, MSR_SYSENTER_EIP);
628 Cpu::wrmsr(sysenter_esp, MSR_SYSENTER_ESP);
// Clear the TSS descriptor's busy bit (bit 1 of access) so set_tr()
// can reload TR without a #GP.
637 tss_entry = (*Cpu::cpus.cpu(current_cpu()).get_gdt())[tr / 8];
638 tss_entry.access &= 0xfd;
639 (*Cpu::cpus.cpu(current_cpu()).get_gdt())[tr / 8] = tss_entry;
641 Cpu::set_tr(tr); // TODO move under stgi in asm
// Publish guest state and hardware-written exit info back to userspace.
643 copy_state_save_area(vmcb_s, kernel_vmcb_s);
644 copy_control_area_back(vmcb_s, kernel_vmcb_s);
// Restore the paging roots userspace originally supplied.
646 if (!(vmcb_s->np_enabled()))
647 vmcb_s->state_save_area.cr3 = orig_cr3;
649 vmcb_s->control_area.n_cr3 = orig_ncr3;
651 LOG_TRACE("VM-SVM", "svm", current(), Log_vm_svm_exit,
652 l->exitcode = vmcb_s->control_area.exitcode;
653 l->exitinfo1 = vmcb_s->control_area.exitinfo1;
654 l->exitinfo2 = vmcb_s->control_area.exitinfo2;
655 l->rip = vmcb_s->state_save_area.rip;
658 // check for IRQ exit
// 0x60 is the SVM VMEXIT_INTR exit code (physical external interrupt).
659 if (kernel_vmcb_s->control_area.exitcode == 0x60)
662 vcpu->state &= ~(Vcpu_state::F_traps | Vcpu_state::F_user_mode);
// Public vCPU resume entry point. Locates the userspace VMCB at a
// fixed offset (0x400) inside the extended vCPU state, short-circuits
// pending IRQs as a synthetic VMEXIT_INTR, and dispatches the actual
// run to do_resume_vcpu().
668 Vm_svm::resume_vcpu(Context *ctxt, Vcpu_state *vcpu, bool user_mode)
671 assert_kdb (user_mode);
// Extended vCPU state (which holds the VMCB) must be enabled.
673 if (EXPECT_FALSE(!(ctxt->state(true) & Thread_ext_vcpu_enabled)))
675 ctxt->arch_load_vcpu_kern_state(vcpu, true);
676 return -L4_err::EInval;
679 Vmcb *vmcb_s = reinterpret_cast<Vmcb*>(reinterpret_cast<char *>(vcpu) + 0x400);
682 // in the case of disabled IRQs and a pending IRQ directly simulate an
683 // external interrupt intercept
684 if ( !(vcpu->_saved_state & Vcpu_state::F_irqs)
685 && (vcpu->sticky_flags & Vcpu_state::Sf_irq_pending))
// 0x60 == SVM VMEXIT_INTR: report as if the hardware intercepted an IRQ.
687 vmcb_s->control_area.exitcode = 0x60;
688 ctxt->arch_load_vcpu_kern_state(vcpu, true);
689 return 1; // return 1 to indicate pending IRQs (IPCs)
692 int r = do_resume_vcpu(ctxt, vcpu, vmcb_s);
694 // test for error or non-IRQ exit reason
696 ctxt->arch_load_vcpu_kern_state(vcpu, true);
701 // check for IRQ exits and allow to handle the IRQ
703 Proc::preemption_point();
705 // Check if the current context got a message delivered.
706 // This is done by testing for a valid continuation.
707 // When a continuation is set we have to directly
708 // leave the kernel to not overwrite the vcpu-regs
710 Thread *t = nonull_static_cast<Thread*>(ctxt);
712 if (t->continuation_test_and_restore())
714 ctxt->arch_load_vcpu_kern_state(vcpu, true);
715 t->fast_return_to_user(vcpu->_entry_ip, vcpu->_entry_sp,
716 t->vcpu_state().usr().get());
723 // ------------------------------------------------------------------------
724 IMPLEMENTATION [svm && debug]:
// Format this trace-buffer entry into 'buf' (at most 'max' bytes);
// returns the snprintf() result.
728 Vm_svm::Log_vm_svm_exit::print(int max, char *buf) const
730 return snprintf(buf, max, "ec=%lx ei1=%08lx ei2=%08lx rip=%08lx",
731 exitcode, exitinfo1, exitinfo2, rip);