Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f85815945fc6d62532c9735747759452cbe7c101..9120ae1901e42804828dbde5db41707c113b0cbc 100644
@@ -42,6 +42,7 @@
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/perf_event.h>
+#include <asm/kexec.h>
 
 #include "trace.h"
 
@@ -802,11 +803,6 @@ static inline bool cpu_has_vmx_ept_ad_bits(void)
        return vmx_capability.ept & VMX_EPT_AD_BIT;
 }
 
-static inline bool cpu_has_vmx_invept_individual_addr(void)
-{
-       return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
-}
-
 static inline bool cpu_has_vmx_invept_context(void)
 {
        return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT;
@@ -992,6 +988,46 @@ static void vmcs_load(struct vmcs *vmcs)
                       vmcs, phys_addr);
 }
 
+#ifdef CONFIG_KEXEC
+/*
+ * This bitmap indicates, per cpu, whether the vmclear
+ * operation is enabled. All cpus are disabled by
+ * default.
+ */
+static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
+
+static inline void crash_enable_local_vmclear(int cpu)
+{
+       cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline void crash_disable_local_vmclear(int cpu)
+{
+       cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static inline int crash_local_vmclear_enabled(int cpu)
+{
+       return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
+}
+
+static void crash_vmclear_local_loaded_vmcss(void)
+{
+       int cpu = raw_smp_processor_id();
+       struct loaded_vmcs *v;
+
+       if (!crash_local_vmclear_enabled(cpu))
+               return;
+
+       list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
+                           loaded_vmcss_on_cpu_link)
+               vmcs_clear(v->vmcs);
+}
+#else
+static inline void crash_enable_local_vmclear(int cpu) { }
+static inline void crash_disable_local_vmclear(int cpu) { }
+#endif /* CONFIG_KEXEC */
+
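
The crash_vmclear_local_loaded_vmcss() callback defined above is reached on
the kdump path through an RCU-protected function pointer, which vmx_init()
publishes further down in this patch. A minimal sketch of the consumer side,
modeled on the companion arch/x86/kernel/crash.c change in this series (the
definitions below are an assumption, not part of this diff):

typedef void crash_vmclear_fn(void);
crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss;

static inline void cpu_crash_vmclear_loaded_vmcss(void)
{
	crash_vmclear_fn *do_vmclear;

	rcu_read_lock();
	do_vmclear = rcu_dereference(crash_vmclear_loaded_vmcss);
	if (do_vmclear)
		do_vmclear();	/* VMCLEARs every vmcs loaded on this cpu */
	rcu_read_unlock();
}

Clearing the loaded VMCSs before jumping into the kdump kernel keeps the
processor from writing VMCS data back into memory that the crash kernel now
owns.
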
 static void __loaded_vmcs_clear(void *arg)
 {
        struct loaded_vmcs *loaded_vmcs = arg;
@@ -1001,15 +1037,28 @@ static void __loaded_vmcs_clear(void *arg)
                return; /* vcpu migration can race with cpu offline */
        if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
                per_cpu(current_vmcs, cpu) = NULL;
+       crash_disable_local_vmclear(cpu);
        list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
+
+       /*
+        * We should ensure that updating loaded_vmcs->loaded_vmcss_on_cpu_link
+        * happens before setting loaded_vmcs->cpu to -1, which is done in
+        * loaded_vmcs_init. Otherwise, another cpu could see cpu == -1 first
+        * and then add the vmcs to the percpu list before it is deleted here.
+        */
+       smp_wmb();
+
        loaded_vmcs_init(loaded_vmcs);
+       crash_enable_local_vmclear(cpu);
 }
 
 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 {
-       if (loaded_vmcs->cpu != -1)
-               smp_call_function_single(
-                       loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1);
+       int cpu = loaded_vmcs->cpu;
+
+       if (cpu != -1)
+               smp_call_function_single(cpu,
+                        __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 
 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
@@ -1051,17 +1100,6 @@ static inline void ept_sync_context(u64 eptp)
        }
 }
 
-static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
-{
-       if (enable_ept) {
-               if (cpu_has_vmx_invept_individual_addr())
-                       __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
-                                       eptp, gpa);
-               else
-                       ept_sync_context(eptp);
-       }
-}
-
 static __always_inline unsigned long vmcs_readl(unsigned long field)
 {
        unsigned long value;
@@ -1535,8 +1573,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
                kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
                local_irq_disable();
+               crash_disable_local_vmclear(cpu);
+
+               /*
+                * The read of loaded_vmcs->cpu must be ordered before
+                * fetching loaded_vmcs->loaded_vmcss_on_cpu_link.
+                * See the comments in __loaded_vmcs_clear().
+                */
+               smp_rmb();
+
                list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
                         &per_cpu(loaded_vmcss_on_cpu, cpu));
+               crash_enable_local_vmclear(cpu);
                local_irq_enable();
 
                /*
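
The smp_rmb() added here pairs with the smp_wmb() in __loaded_vmcs_clear()
above. A condensed, illustrative view of the pairing (not literal patch code):

/* Writer: __loaded_vmcs_clear() on the old cpu */
list_del(&v->loaded_vmcss_on_cpu_link);		/* A: unlink from percpu list */
smp_wmb();					/* order A before B */
v->cpu = -1;					/* B: mark vmcs as unloaded */

/* Reader: vmx_vcpu_load() migrating the vmcs to a new cpu */
smp_rmb();					/* order the read of cpu (B) */
list_add(&v->loaded_vmcss_on_cpu_link,		/* before relinking (A) */
	 &per_cpu(loaded_vmcss_on_cpu, cpu));

Without the barriers, a cpu that observed cpu == -1 could re-add the vmcs to
a percpu list before the unlink on the old cpu became visible.
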
@@ -1839,11 +1887,10 @@ static u64 guest_read_tsc(void)
  * Like guest_read_tsc, but always returns L1's notion of the timestamp
  * counter, even if a nested guest (L2) is currently running.
  */
-u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu)
+u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-       u64 host_tsc, tsc_offset;
+       u64 tsc_offset;
 
-       rdtscll(host_tsc);
        tsc_offset = is_guest_mode(vcpu) ?
                to_vmx(vcpu)->nested.vmcs01_tsc_offset :
                vmcs_read64(TSC_OFFSET);
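
With this change the caller supplies the host TSC instead of
vmx_read_l1_tsc() issuing its own rdtsc, so a single TSC sample can be used
consistently across related calculations. A hedged sketch of a call site,
modeled on the companion x86.c change (names assumed):

u64 host_tsc = native_read_tsc();	/* sample the TSC exactly once */
u64 l1_tsc = kvm_x86_ops->read_l1_tsc(vcpu, host_tsc);
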
@@ -1866,6 +1913,11 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
                WARN(1, "user requested TSC rate below hardware speed\n");
 }
 
+static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
+{
+       return vmcs_read64(TSC_OFFSET);
+}
+
 /*
  * Writes 'offset' into the guest's timestamp counter offset register.
  */
@@ -2202,15 +2254,17 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
  * Returns 0 on success, non-0 otherwise.
  * Assumes vcpu_load() was already called.
  */
-static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
+static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
        struct vcpu_vmx *vmx = to_vmx(vcpu);
        struct shared_msr_entry *msr;
        int ret = 0;
+       u32 msr_index = msr_info->index;
+       u64 data = msr_info->data;
 
        switch (msr_index) {
        case MSR_EFER:
-               ret = kvm_set_msr_common(vcpu, msr_index, data);
+               ret = kvm_set_msr_common(vcpu, msr_info);
                break;
 #ifdef CONFIG_X86_64
        case MSR_FS_BASE:
@@ -2236,7 +2290,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                vmcs_writel(GUEST_SYSENTER_ESP, data);
                break;
        case MSR_IA32_TSC:
-               kvm_write_tsc(vcpu, data);
+               kvm_write_tsc(vcpu, msr_info);
                break;
        case MSR_IA32_CR_PAT:
                if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -2244,7 +2298,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                        vcpu->arch.pat = data;
                        break;
                }
-               ret = kvm_set_msr_common(vcpu, msr_index, data);
+               ret = kvm_set_msr_common(vcpu, msr_info);
+               break;
+       case MSR_IA32_TSC_ADJUST:
+               ret = kvm_set_msr_common(vcpu, msr_info);
                break;
        case MSR_TSC_AUX:
                if (!vmx->rdtscp_enabled)
@@ -2267,7 +2324,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
                        }
                        break;
                }
-               ret = kvm_set_msr_common(vcpu, msr_index, data);
+               ret = kvm_set_msr_common(vcpu, msr_info);
        }
 
        return ret;
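
For reference, the struct msr_data now threaded through vmx_set_msr() is
introduced by a companion patch in this series; to the best of our reading
it carries the following fields (reproduced as an aid, not part of this diff):

struct msr_data {
	bool host_initiated;	/* write came via ioctl, not a guest wrmsr */
	u32 index;		/* MSR number (guest ECX for wrmsr) */
	u64 data;		/* value to write */
};

The host_initiated flag lets kvm_set_msr_common() treat guest writes to
MSR_IA32_TSC_ADJUST differently from host-initiated restores during
migration.
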
@@ -2341,6 +2398,18 @@ static int hardware_enable(void *garbage)
                return -EBUSY;
 
        INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+
+       /*
+        * Now we can enable the vmclear operation in kdump
+        * since the loaded_vmcss_on_cpu list on this cpu
+        * has been initialized.
+        *
+        * Though the cpu is not in VMX operation yet, it is
+        * safe to enable the vmclear operation because the
+        * loaded_vmcss_on_cpu list is empty.
+        */
+       crash_enable_local_vmclear(cpu);
+
        rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
 
        test_bits = FEATURE_CONTROL_LOCKED;
@@ -2697,6 +2766,7 @@ static void fix_pmode_dataseg(struct kvm_vcpu *vcpu, int seg, struct kvm_segment
        if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) {
                tmp.base = vmcs_readl(sf->base);
                tmp.selector = vmcs_read16(sf->selector);
+               tmp.dpl = tmp.selector & SELECTOR_RPL_MASK;
                tmp.s = 1;
        }
        vmx_set_segment(vcpu, &tmp, seg);
@@ -3246,7 +3316,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
         * unrestricted guest, like Westmere, to an older host that doesn't
         * have unrestricted guest, like Nehalem.
         */
-       if (!enable_unrestricted_guest && vmx->rmode.vm86_active) {
+       if (vmx->rmode.vm86_active) {
                switch (seg) {
                case VCPU_SREG_CS:
                        vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
@@ -3897,8 +3967,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
        vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
        set_cr4_guest_host_mask(vmx);
 
-       kvm_write_tsc(&vmx->vcpu, 0);
-
        return 0;
 }
 
@@ -3908,8 +3976,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        u64 msr;
        int ret;
 
-       vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-
        vmx->rmode.vm86_active = 0;
 
        vmx->soft_vnmi_blocked = 0;
@@ -3921,10 +3987,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                msr |= MSR_IA32_APICBASE_BSP;
        kvm_set_apic_base(&vmx->vcpu, msr);
 
-       ret = fx_init(&vmx->vcpu);
-       if (ret != 0)
-               goto out;
-
        vmx_segment_cache_clear(vmx);
 
        seg_setup(VCPU_SREG_CS);
@@ -3965,7 +4027,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
                kvm_rip_write(vcpu, 0xfff0);
        else
                kvm_rip_write(vcpu, 0);
-       kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
 
        vmcs_writel(GUEST_GDTR_BASE, 0);
        vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
@@ -4015,7 +4076,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
        /* HACK: Don't enable emulation on guest boot/reset */
        vmx->emulation_required = 0;
 
-out:
        return ret;
 }
 
@@ -4287,16 +4347,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        if (is_machine_check(intr_info))
                return handle_machine_check(vcpu);
 
-       if ((vect_info & VECTORING_INFO_VALID_MASK) &&
-           !is_page_fault(intr_info)) {
-               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-               vcpu->run->internal.ndata = 2;
-               vcpu->run->internal.data[0] = vect_info;
-               vcpu->run->internal.data[1] = intr_info;
-               return 0;
-       }
-
        if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
                return 1;  /* already handled by vmx_vcpu_run() */
 
@@ -4315,6 +4365,22 @@ static int handle_exception(struct kvm_vcpu *vcpu)
        error_code = 0;
        if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
                error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
+       /*
+        * A #PF with PFEC.RSVD = 1 indicates that the guest is accessing
+        * MMIO, so it is better to report an internal error.
+        * See the comments in vmx_handle_exit.
+        */
+       if ((vect_info & VECTORING_INFO_VALID_MASK) &&
+           !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
+               vcpu->run->internal.ndata = 2;
+               vcpu->run->internal.data[0] = vect_info;
+               vcpu->run->internal.data[1] = intr_info;
+               return 0;
+       }
+
        if (is_page_fault(intr_info)) {
                /* EPT won't cause page fault directly */
                BUG_ON(enable_ept);
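
The relocated check is denser than the one it replaces; written out as a
purely illustrative helper (not part of the patch), the logic is:

static bool simultaneous_exception(u32 vect_info, u32 intr_info,
				   u32 error_code)
{
	bool during_delivery = vect_info & VECTORING_INFO_VALID_MASK;
	bool ordinary_pf = is_page_fault(intr_info) &&
			   !(error_code & PFERR_RSVD_MASK);

	/* A #PF with PFEC.RSVD set means the guest touched MMIO. */
	return during_delivery && !ordinary_pf;
}
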
@@ -4626,11 +4692,15 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu)
 
 static int handle_wrmsr(struct kvm_vcpu *vcpu)
 {
+       struct msr_data msr;
        u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
        u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
                | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
 
-       if (vmx_set_msr(vcpu, ecx, data) != 0) {
+       msr.data = data;
+       msr.index = ecx;
+       msr.host_initiated = false;
+       if (vmx_set_msr(vcpu, &msr) != 0) {
                trace_kvm_msr_write_ex(ecx, data);
                kvm_inject_gp(vcpu, 0);
                return 1;
@@ -4827,11 +4897,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
 
        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
-       if (exit_qualification & (1 << 6)) {
-               printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
-               return -EINVAL;
-       }
-
        gla_validity = (exit_qualification >> 7) & 0x3;
        if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
@@ -5979,13 +6044,24 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
                return 0;
        }
 
+       /*
+        * Note:
+        * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it is caused by a
+        * delivery event, since that indicates the guest is accessing MMIO.
+        * The vm-exit can be triggered again after returning to the guest,
+        * which would cause an infinite loop.
+        */
        if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
                        (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
                        exit_reason != EXIT_REASON_EPT_VIOLATION &&
-                       exit_reason != EXIT_REASON_TASK_SWITCH))
-               printk(KERN_WARNING "%s: unexpected, valid vectoring info "
-                      "(0x%x) and exit reason is 0x%x\n",
-                      __func__, vectoring_info, exit_reason);
+                       exit_reason != EXIT_REASON_TASK_SWITCH)) {
+               vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+               vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
+               vcpu->run->internal.ndata = 2;
+               vcpu->run->internal.data[0] = vectoring_info;
+               vcpu->run->internal.data[1] = exit_reason;
+               return 0;
+       }
 
        if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked &&
            !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis(
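
Instead of a kernel log line, the exit is now forwarded to userspace. A
hypothetical VMM-side handler for the new suberror (illustrative only; just
the kvm_run field names come from the KVM API):

if (run->exit_reason == KVM_EXIT_INTERNAL_ERROR &&
    run->internal.suberror == KVM_INTERNAL_ERROR_DELIVERY_EV) {
	fprintf(stderr, "event delivery caused exit: vectoring=0x%llx reason=0x%llx\n",
		(unsigned long long)run->internal.data[0],
		(unsigned long long)run->internal.data[1]);
	abort();	/* the guest cannot make forward progress */
}
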
@@ -7309,6 +7385,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
        .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
        .set_tsc_khz = vmx_set_tsc_khz,
+       .read_tsc_offset = vmx_read_tsc_offset,
        .write_tsc_offset = vmx_write_tsc_offset,
        .adjust_tsc_offset = vmx_adjust_tsc_offset,
        .compute_tsc_offset = vmx_compute_tsc_offset,
@@ -7367,6 +7444,11 @@ static int __init vmx_init(void)
        if (r)
                goto out3;
 
+#ifdef CONFIG_KEXEC
+       rcu_assign_pointer(crash_vmclear_loaded_vmcss,
+                          crash_vmclear_local_loaded_vmcss);
+#endif
+
        vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
        vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
        vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -7404,6 +7486,11 @@ static void __exit vmx_exit(void)
        free_page((unsigned long)vmx_io_bitmap_b);
        free_page((unsigned long)vmx_io_bitmap_a);
 
+#ifdef CONFIG_KEXEC
+       rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL);
+       synchronize_rcu();
+#endif
+
        kvm_exit();
 }
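
Note the teardown order above: publishing NULL and then calling
synchronize_rcu() ensures that any crash-shutdown path already inside
rcu_read_lock() finishes dereferencing the old callback before the module
text is freed.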