4 #include "per_cpu_data.h"
// VT-x (VMX) flavor of a Vm task. Guest state is kept in a software VMCS
// buffer inside the vCPU state page and copied to/from the hardware VMCS
// around each VM entry/exit.
10 class Vm_vmx : public Vm
// Low-level world-switch entry point (presumably VMLAUNCH/VMRESUME; the
// implementation is assembly, named "resume_vm_vmx", not visible here).
// regparm(3): arguments are passed in registers on ia32.
13 static unsigned long resume_vm_vmx(Vcpu_state *regs)
14 asm("resume_vm_vmx") __attribute__((__regparm__(3)));
24 //----------------------------------------------------------------------------
28 #include "mem_space.h"
31 #include "thread.h" // XXX: circular dependency, move this out of here!
32 #include "thread_state.h" // XXX: circular dependency, move this out of here!
// Construct a VMX VM object accounted against RAM quota `q`.
// (Constructor body is elided in this view.)
38 Vm_vmx::Vm_vmx(Ram_quota *q)
// Placement new: the storage `p` is pre-allocated by the factory
// (quota-checked allocation happens elsewhere); only sanity-check the size.
44 Vm_vmx::operator new (size_t size, void *p) throw()
47 assert (size == sizeof (Vm_vmx));
// Return the object's storage to the quota-aware slab allocator,
// crediting the freed memory back to the owning RAM quota.
53 Vm_vmx::operator delete (void *ptr)
55 Vm_vmx *t = reinterpret_cast<Vm_vmx*>(ptr);
56 allocator<Vm_vmx>()->q_free(t->ram_quota(), ptr);
// Map a VMCS field encoding to the start of its group inside the software
// VMCS buffer. Encoding bits 14:13 select the width class and bits 11:10
// the field type (see Intel SDM); each (width, type) group occupies 0x80
// bytes, and the leading "+ 1" skips a 0x80-byte header area at the start
// of the buffer.
64 Vm_vmx::field_offset(void *vmcs, unsigned field)
66 return (void *)((char *)vmcs
67 + ((field >> 13) * 4 + ((field >> 10) & 3) + 1) * 0x80);
// Byte width of a VMCS field, derived from encoding bits 14:13:
// 0 -> 16-bit (2), 1 -> 64-bit (8), 2 -> 32-bit (4),
// 3 -> natural width (sizeof(Mword), i.e. 4 or 8 depending on the arch).
72 Vm_vmx::field_width(unsigned field)
74 static const char widths[4] = { 2, 8, 4, sizeof(Mword) };
75 return widths[field >> 13];
// Read `field` from the software VMCS, filter it through the capability
// mask `m` (must-be-one / must-be-zero bits derived from the VMX capability
// MSRs), write the sanitized value into the hardware VMCS, and return the
// effective value so callers can test which bits actually took effect.
82 Vm_vmx::load(unsigned field, void *vmcs, Vmx_info::Bit_defs<T> const &m)
84 T res = m.apply(read<T>(vmcs, field));
85 Vmx::vmwrite(field, res);
86 return Vmx_info::Flags<T>(res);
// Copy a consecutive range of fields from the software VMCS into the
// hardware VMCS. Step is 2 because VMCS field encodings increase in steps
// of two (the low encoding bit distinguishes full/high access).
91 Vm_vmx::load(unsigned field_first, unsigned field_last, void *vmcs)
93 for (; field_first <= field_last; field_first += 2)
94 load(field_first, vmcs);
// Raw typed read from the software VMCS buffer: locate the field's group
// via field_offset(), then index by (field >> 1) & 0xff, i.e. the field's
// position within its group (encodings advance in steps of two).
98 template< typename T >
100 Vm_vmx::_internal_read(void *vmcs, unsigned field)
102 vmcs = field_offset(vmcs, field);
103 return *((T *)vmcs + ((field >> 1) & 0xff));
// Raw typed write into the software VMCS buffer; exact mirror of
// _internal_read() (same group/index computation).
106 PRIVATE inline static
107 template< typename T >
109 Vm_vmx::_internal_write(void *vmcs, unsigned field, T value)
111 vmcs = field_offset(vmcs, field);
112 *((T*)vmcs + ((field >> 1) & 0xff)) = value;
// Copy one field, software VMCS -> hardware VMCS, dispatching on the
// field's width class (encoding bits 14:13; the switch header is elided
// in this view): 0 = 16-bit, 1 = 64-bit, 2 = 32-bit, 3 = natural width.
117 Vm_vmx::load(unsigned field, void *vmcs)
121 case 0: Vmx::vmwrite(field, _internal_read<Unsigned16>(vmcs, field)); break;
122 case 1: Vmx::vmwrite(field, _internal_read<Unsigned64>(vmcs, field)); break;
123 case 2: Vmx::vmwrite(field, _internal_read<Unsigned32>(vmcs, field)); break;
124 case 3: Vmx::vmwrite(field, _internal_read<Mword>(vmcs, field)); break;
// Copy one field, hardware VMCS -> software VMCS (inverse of load());
// same width-class dispatch, switch header elided in this view.
130 Vm_vmx::store(unsigned field, void *vmcs)
134 case 0: _internal_write(vmcs, field, Vmx::vmread<Unsigned16>(field)); break;
135 case 1: _internal_write(vmcs, field, Vmx::vmread<Unsigned64>(field)); break;
136 case 2: _internal_write(vmcs, field, Vmx::vmread<Unsigned32>(field)); break;
137 case 3: _internal_write(vmcs, field, Vmx::vmread<Mword>(field)); break;
// Copy a consecutive range of fields from the hardware VMCS back into the
// software VMCS; step 2 for the same encoding reason as load(range).
143 Vm_vmx::store(unsigned field_first, unsigned field_last, void *vmcs)
145 for (; field_first <= field_last; field_first += 2)
146 store(field_first, vmcs);
// Width-aware store of `value` into the software VMCS: narrow/widen to
// the field's declared width class before writing (switch header elided).
149 PRIVATE inline static
150 template< typename T >
152 Vm_vmx::write(void *vmcs, unsigned field, T value)
156 case 0: _internal_write(vmcs, field, (Unsigned16)value); break;
157 case 1: _internal_write(vmcs, field, (Unsigned64)value); break;
158 case 2: _internal_write(vmcs, field, (Unsigned32)value); break;
159 case 3: _internal_write(vmcs, field, (Mword)value); break;
// Width-aware load from the software VMCS: read with the field's declared
// width class, then convert to the caller's type T (switch header elided).
163 PRIVATE inline static
164 template< typename T >
166 Vm_vmx::read(void *vmcs, unsigned field)
170 case 0: return _internal_read<Unsigned16>(vmcs, field);
171 case 1: return _internal_read<Unsigned64>(vmcs, field);
172 case 2: return _internal_read<Unsigned32>(vmcs, field);
173 case 3: return _internal_read<Mword>(vmcs, field);
// Transfer the user-provided guest state from the software VMCS `src` into
// the hardware VMCS of `cpu`, sanitizing all control fields against this
// CPU's VMX capabilities so user space cannot enable features the kernel
// does not permit. Several lines between the visible statements are elided
// in this view.
181 Vm_vmx::load_guest_state(unsigned cpu, void *src)
183 Vmx &vmx = Vmx::cpus.cpu(cpu);
185 // read VM-entry controls, apply filter and keep for later
186 Vmx_info::Flags<Unsigned32> entry_ctls
187 = load<Unsigned32>(Vmx::F_entry_ctls, src, vmx.info.entry_ctls);
189 Vmx_info::Flags<Unsigned32> pinbased_ctls
190 = load<Unsigned32>(Vmx::F_pin_based_ctls, src, vmx.info.pinbased_ctls);
192 Vmx_info::Flags<Unsigned32> procbased_ctls
193 = load<Unsigned32>(Vmx::F_proc_based_ctls, src, vmx.info.procbased_ctls);
// Secondary processor-based controls only exist if the primary controls
// enable them; otherwise treat them as all-zero.
195 Vmx_info::Flags<Unsigned32> procbased_ctls_2;
196 if (procbased_ctls.test(Vmx::PRB1_enable_proc_based_ctls_2))
197 procbased_ctls_2 = load<Unsigned32>(Vmx::F_proc_based_ctls_2, src, vmx.info.procbased_ctls2);
199 procbased_ctls_2 = Vmx_info::Flags<Unsigned32>(0);
201 load<Unsigned32>(Vmx::F_exit_ctls, src, vmx.info.exit_ctls);
203 // write 16-bit fields
204 load(0x800, 0x80e, src);
206 // write 64-bit fields
209 // check if the following bits are allowed to be set in entry_ctls
// Entry-control bits 14/15/13 gate loading of guest PAT / EFER /
// IA32_PERF_GLOBAL_CTRL (per the VM-entry controls layout).
210 if (entry_ctls.test(14)) // PAT load requested
213 if (entry_ctls.test(15)) // EFER load requested
216 if (entry_ctls.test(13)) // IA32_PERF_GLOBAL_CTRL load requested
219 // this is Fiasco.OC internal state
222 load(0x280a, 0x2810, src);
225 // write 32-bit fields
226 load(0x4800, 0x482a, src);
228 if (pinbased_ctls.test(6)) // activate vmx-preemption timer
231 // write natural-width fields
// Guest CR0 is filtered through the kernel's CR0 fixed-bit definitions.
232 load<Mword>(0x6800, src, vmx.info.cr0_defs);
// 64-bit kernel: the guest CR3 (0x6802) is forced to this task's page
// directory only when the guest runs in long mode (EFER.LME set in the
// software VMCS field 0x2806); the elided else-path warns instead.
234 if (sizeof(long) > sizeof(int))
236 if (read<Mword>(src, 0x2806) & EFER_LME)
237 Vmx::vmwrite(0x6802, (Mword)phys_dir());
239 WARN("VMX: No, not possible\n");
243 // for 32bit we can just load the Vm pdbr
244 Vmx::vmwrite(0x6802, (Mword)phys_dir());
247 load<Mword>(0x6804, src, vmx.info.cr4_defs);
248 load(0x6806, 0x6826, src);
250 // VPID must be virtualized in Fiasco
252 if (procbased_ctls_2 & Vmx::PB2_enable_vpid)
253 load(Vmx::F_vpid, src);
256 // currently io-bitmaps are unsupported
257 // currently msr-bitmaps are unsupported
259 // load(0x200C, src); for SMM virtualization
260 load(Vmx::F_tsc_offset, src);
262 // no virtual APIC yet, and has to be managed in kernel somehow
264 if (procbased_ctls.test(Vmx::PRB1_tpr_shadow))
268 if (procbased_ctls_2.test(Vmx::PRB2_virtualize_apic))
269 load(Vmx::F_apic_access_addr, src);
271 // exception bit map and pf error-code stuff
272 load(0x4004, 0x4008, src);
274 // vm entry control stuff
// Bit 31 of the VM-entry interruption info = "valid": an event injection
// is requested for the next VM entry.
275 Unsigned32 irq_info = read<Unsigned32>(src, Vmx::F_entry_int_info);
276 if (irq_info & (1UL << 31))
278 // do event injection
280 // load error code, if required
281 if (irq_info & (1UL << 11))
282 load(Vmx::F_entry_exc_error_code, src);
284 // types, that require an insn length have bit 10 set (type 4, 5, and 6)
285 if (irq_info & (1UL << 10))
286 load(Vmx::F_entry_insn_len, src);
288 Vmx::vmwrite(Vmx::F_entry_int_info, irq_info);
291 // hm, we have to check for sanitizing the cr0 and cr4 shadow stuff
292 load(0x6000, 0x6006, src);
294 // no cr3 target values supported
// After a VM exit: copy the guest state from the hardware VMCS of `cpu`
// back into the software VMCS `dest` so user space sees the updated guest.
// Some lines are elided in this view.
300 Vm_vmx::store_guest_state(unsigned cpu, void *dest)
302 // read 16-bit fields
303 store(0x800, 0x80e, dest);
305 // read 64-bit fields
// Re-derive the effective VM-exit controls (user request filtered by CPU
// capabilities) to decide which optional guest MSR fields were saved.
308 Vmx_info &vmx_info = Vmx::cpus.cpu(cpu).info;
309 Vmx_info::Flags<Unsigned32> exit_ctls
310 = Vmx_info::Flags<Unsigned32>(vmx_info.exit_ctls.apply(read<Unsigned32>(dest, Vmx::F_exit_ctls)));
// Exit-control bits 18/20/22 gate saving of guest PAT / EFER /
// VMX-preemption timer value.
312 if (exit_ctls.test(18)) store(Vmx::F_guest_pat, dest);
313 if (exit_ctls.test(20)) store(Vmx::F_guest_efer, dest);
314 if (exit_ctls.test(22)) store(Vmx::F_preempt_timer, dest);
316 // EPT and PAE handling missing
318 if (Vmx::cpus.cpu(cpu).has_ept())
319 store(0x280a, 0x2810, dest);
322 // read 32-bit fields
323 store(0x4800, 0x4826, dest);
325 // sysenter msr is not saved here, because we trap all msr accesses right now
329 store(0x6824, 0x6826, dest);
332 // read natural-width fields
335 store(0x6804, 0x6822, dest);
// Copy the VM-exit information fields from the hardware VMCS of `cpu` into
// the software VMCS `dest` so user space can inspect the exit reason and
// qualification. Some lines are elided in this view.
340 Vm_vmx::store_exit_info(unsigned cpu, void *dest)
343 // read 64-bit fields, that is a EPT pf thing
345 if (Vmx::cpus.cpu(cpu).has_ept())
349 // clear the valid bit in Vm-entry interruption information
// Bit 31 = "valid"; clearing it signals that the previously requested
// event injection has been consumed by the VM entry.
351 Unsigned32 tmp = read<Unsigned32>(dest, Vmx::F_entry_int_info);
352 if (tmp & (1UL << 31))
353 write(dest, Vmx::F_entry_int_info, tmp & ~((Unsigned32)1 << 31));
356 // read 32-bit fields
357 store(0x4400, 0x440e, dest);
359 // read natural-width fields
360 store(0x6400, 0x640a, dest);
// Debug helper: print each field encoding in [f, t] (step 2, see load())
// side by side — the value in the hardware VMCS vs. the software VMCS `v`.
365 Vm_vmx::dump(void *v, unsigned f, unsigned t)
367 for (; f <= t; f += 2)
368 printf("%04x: VMCS: %16lx V: %16lx\n",
369 f, Vmx::vmread<Mword>(f), read<Mword>(v, f));
// Debug helper: dump all VMCS field groups (16/64/32/natural-width,
// guest/host/control) of the software VMCS `v` against the hardware VMCS.
// NOTE(review): the range 0x2c00..0x2804 looks inverted (end < start) and
// would dump nothing given dump()'s `f <= t` loop — verify intended range.
374 Vm_vmx::dump_state(void *v)
376 dump(v, 0x0800, 0x080e);
377 dump(v, 0x0c00, 0x0c0c);
378 dump(v, 0x2000, 0x201a);
379 dump(v, 0x2800, 0x2810);
380 dump(v, 0x2c00, 0x2804);
381 dump(v, 0x4000, 0x4022);
382 dump(v, 0x4400, 0x4420);
383 dump(v, 0x4800, 0x482a);
384 dump(v, 0x6800, 0x6826);
385 dump(v, 0x6c00, 0x6c16);
// Core VM-entry path: validate preconditions, sync guest state into the
// hardware VMCS, perform the world switch, then sync exit state back.
// ctxt   - kernel context (Thread) performing the resume
// vcpu   - user-visible vCPU register state passed to the asm entry
// vmcs_s - software VMCS buffer inside the vCPU state page
// Several lines (braces, else branches) are elided in this view.
388 PRIVATE inline NOEXPORT
390 Vm_vmx::do_resume_vcpu(Context *ctxt, Vcpu_state *vcpu, void *vmcs_s)
392 assert (cpu_lock.test());
394 /* these 4 must not use ldt entries */
// Bit 2 of a segment selector is the table indicator (1 = LDT).
395 assert (!(Cpu::get_cs() & (1 << 2)));
396 assert (!(Cpu::get_ss() & (1 << 2)));
397 assert (!(Cpu::get_ds() & (1 << 2)));
398 assert (!(Cpu::get_es() & (1 << 2)));
400 unsigned cpu = current_cpu();
401 Vmx &v = Vmx::cpus.cpu(cpu);
403 if (!v.vmx_enabled())
405 WARNX(Info, "VMX: not supported/enabled\n");
406 return -L4_err::ENodev;
410 // This generates a circular dep between thread<->task, this cries for a
411 // new abstraction...
// Make sure the FPU belongs to this context before entering the guest.
412 if (!(ctxt->state() & Thread_fpu_owner))
414 if (EXPECT_FALSE(!static_cast<Thread*>(ctxt)->switchin_fpu()))
416 WARN("VMX: switchin_fpu failed\n");
417 return -L4_err::EInval;
// 0x201a is the EPT pointer field; a non-zero value means the user asked
// for nested paging, which this path does not support.
422 if (EXPECT_FALSE(read<Unsigned32>(vmcs_s, 0x201a) != 0)) // EPT POINTER
424 WARN("VMX: no nested paging available\n");
425 return commit_result(-L4_err::EInval);
429 // increment our refcount, and drop it at the end automatically
430 Ref_ptr<Vm_vmx> pin_myself(this);
432 // set volatile host state
433 Vmx::vmwrite<Mword>(Vmx::F_host_cr3, Cpu::get_pdbr()); // host_area.cr3
435 load_guest_state(cpu, vmcs_s);
437 Unsigned16 ldt = Cpu::get_ldt();
// CR2 is not part of the VMCS; hand-load the guest's CR2 before entry...
440 asm volatile("mov %0, %%cr2" : : "r" (read<Mword>(vmcs_s, Vmx::F_guest_cr2)));
442 unsigned long ret = resume_vm_vmx(vcpu);
// Bit 6 (ZF as left by VMLAUNCH/VMRESUME failure) flags an entry error.
444 if (EXPECT_FALSE(ret & 0x40))
445 return -L4_err::EInval;
// ...and save the guest's CR2 right after exit.
450 asm volatile("mov %%cr2, %0" : "=r" (cpu_cr2));
451 write(vmcs_s, Vmx::F_guest_cr2, cpu_cr2);
456 // reload TSS, we use I/O bitmaps
457 // ... do this lazy ...
// Clear the TSS descriptor's busy bit so LTR does not fault, force the
// compiler to order the write, then reload the task register.
460 Gdt_entry *e = &(*Cpu::cpus.cpu(cpu).get_gdt())[Gdt::gdt_tss / 8];
461 e->access &= ~(1 << 1);
462 asm volatile("" : : "m" (*e));
463 Cpu::set_tr(Gdt::gdt_tss);
466 store_guest_state(cpu, vmcs_s);
467 store_exit_info(cpu, vmcs_s);
// Exit reason 1 (low 16 bits) = exit caused by an external interrupt.
469 if ((read<Unsigned32>(vmcs_s, Vmx::F_exit_reason) & 0xffff) == 1)
472 vcpu->state &= ~(Vcpu_state::F_traps | Vcpu_state::F_user_mode);
478 Vm_vmx::resume_vcpu(Context *ctxt, Vcpu_state *vcpu, bool user_mode)
481 assert_kdb (user_mode);
483 if (EXPECT_FALSE(!(ctxt->state(true) & Thread_ext_vcpu_enabled)))
484 return -L4_err::EInval;
486 void *vmcs_s = reinterpret_cast<char *>(vcpu) + 0x400;
490 // in the case of disabled IRQs and a pending IRQ directly simulate an
491 // external interrupt intercept
492 if ( !(vcpu->_saved_state & Vcpu_state::F_irqs)
493 && (vcpu->sticky_flags & Vcpu_state::Sf_irq_pending))
495 // XXX: check if this is correct, we set external irq exit as reason
496 write<Unsigned32>(vmcs_s, Vmx::F_exit_reason, 1);
497 return 1; // return 1 to indicate pending IRQs (IPCs)
500 int r = do_resume_vcpu(ctxt, vcpu, vmcs_s);
502 // test for error or non-IRQ exit reason
506 // check for IRQ exits and allow to handle the IRQ
508 Proc::preemption_point();
510 // Check if the current context got a message delivered.
511 // This is done by testing for a valid continuation.
512 // When a continuation is set we have to directly
513 // leave the kernel to not overwrite the vcpu-regs
515 Thread *t = nonull_static_cast<Thread*>(ctxt);
516 if (t->continuation_test_and_restore())
517 t->fast_return_to_user(vcpu->_entry_ip, vcpu->_entry_sp,
518 t->vcpu_state().usr().get());