#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
+#include <linux/mman.h>
#include <asm/uaccess.h>
#include <asm/msr.h>
{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
+ { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
{ NULL }
};
return ret;
}
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+ u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
+ bool changed = true;
+ int r;
+
+ if (is_long_mode(vcpu) || !is_pae(vcpu))
+ return false;
+
+ mutex_lock(&vcpu->kvm->lock);
+ r = kvm_read_guest(vcpu->kvm, vcpu->cr3 & ~31u, pdpte, sizeof(pdpte));
+ if (r < 0)
+ goto out;
+ changed = memcmp(pdpte, vcpu->pdptrs, sizeof(pdpte)) != 0;
+out:
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return changed;
+}
+
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
if (cr0 & CR0_RESERVED_BITS) {
void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+ if (cr3 == vcpu->cr3 && !pdptrs_changed(vcpu)) {
+ kvm_mmu_flush_tlb(vcpu);
+ return;
+ }
+
if (is_long_mode(vcpu)) {
if (cr3 & CR3_L_MODE_RESERVED_BITS) {
printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
case KVM_CAP_USER_MEMORY:
case KVM_CAP_SET_TSS_ADDR:
+ case KVM_CAP_EXT_CPUID:
r = 1;
break;
default:
kvm_put_guest_fpu(vcpu);
}
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+static int is_efer_nx(void)
{
u64 efer;
- int i;
- struct kvm_cpuid_entry *e, *entry;
rdmsrl(MSR_EFER, efer);
+ return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+ int i;
+ struct kvm_cpuid_entry2 *e, *entry;
+
entry = NULL;
for (i = 0; i < vcpu->cpuid_nent; ++i) {
e = &vcpu->cpuid_entries[i];
break;
}
}
- if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
+ if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
entry->edx &= ~(1 << 20);
printk(KERN_INFO "kvm: guest NX capability removed\n");
}
}
+/* when an old userspace process fills a new kernel module */
static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
struct kvm_cpuid *cpuid,
struct kvm_cpuid_entry __user *entries)
+{
+ int r, i;
+ struct kvm_cpuid_entry *cpuid_entries;
+
+ r = -E2BIG;
+ if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+ goto out;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+ r = -EFAULT;
+ if (copy_from_user(cpuid_entries, entries,
+ cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ goto out_free;
+ for (i = 0; i < cpuid->nent; i++) {
+ vcpu->cpuid_entries[i].function = cpuid_entries[i].function;
+ vcpu->cpuid_entries[i].eax = cpuid_entries[i].eax;
+ vcpu->cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+ vcpu->cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+ vcpu->cpuid_entries[i].edx = cpuid_entries[i].edx;
+ vcpu->cpuid_entries[i].index = 0;
+ vcpu->cpuid_entries[i].flags = 0;
+ vcpu->cpuid_entries[i].padding[0] = 0;
+ vcpu->cpuid_entries[i].padding[1] = 0;
+ vcpu->cpuid_entries[i].padding[2] = 0;
+ }
+ vcpu->cpuid_nent = cpuid->nent;
+ cpuid_fix_nx_cap(vcpu);
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
{
int r;
goto out;
r = -EFAULT;
if (copy_from_user(&vcpu->cpuid_entries, entries,
- cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+ cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
goto out;
vcpu->cpuid_nent = cpuid->nent;
- cpuid_fix_nx_cap(vcpu);
return 0;
out:
return r;
}
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ int r;
+
+ r = -E2BIG;
+ if (cpuid->nent < vcpu->cpuid_nent)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(entries, &vcpu->cpuid_entries,
+ vcpu->cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out;
+ return 0;
+
+out:
+ cpuid->nent = vcpu->cpuid_nent;
+ return r;
+}
+
+static inline u32 bit(int bitno)
+{
+ return 1 << (bitno & 31);
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index)
+{
+ entry->function = function;
+ entry->index = index;
+ cpuid_count(entry->function, entry->index,
+ &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+ entry->flags = 0;
+}
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+ u32 index, int *nent, int maxnent)
+{
+ const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+ bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+ bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+ bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+ bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+ bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+ bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+ bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+ bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+ bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+ const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+ bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+ bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+ bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+ bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+ bit(X86_FEATURE_PGE) |
+ bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+ bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+ bit(X86_FEATURE_SYSCALL) |
+ (bit(X86_FEATURE_NX) && is_efer_nx()) |
+#ifdef CONFIG_X86_64
+ bit(X86_FEATURE_LM) |
+#endif
+ bit(X86_FEATURE_MMXEXT) |
+ bit(X86_FEATURE_3DNOWEXT) |
+ bit(X86_FEATURE_3DNOW);
+ const u32 kvm_supported_word3_x86_features =
+ bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+ const u32 kvm_supported_word6_x86_features =
+ bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+ /* all func 2 cpuid_count() should be called on the same cpu */
+ get_cpu();
+ do_cpuid_1_ent(entry, function, index);
+ ++*nent;
+
+ switch (function) {
+ case 0:
+ entry->eax = min(entry->eax, (u32)0xb);
+ break;
+ case 1:
+ entry->edx &= kvm_supported_word0_x86_features;
+ entry->ecx &= kvm_supported_word3_x86_features;
+ break;
+ /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+ * may return different values. This forces us to get_cpu() before
+ * issuing the first command, and also to emulate this annoying behavior
+ * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+ case 2: {
+ int t, times = entry->eax & 0xff;
+
+ entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ for (t = 1; t < times && *nent < maxnent; ++t) {
+ do_cpuid_1_ent(&entry[t], function, 0);
+ entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+ ++*nent;
+ }
+ break;
+ }
+ /* function 4 and 0xb have additional index. */
+ case 4: {
+ int index, cache_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until cache_type is zero */
+ for (index = 1; *nent < maxnent; ++index) {
+ cache_type = entry[index - 1].eax & 0x1f;
+ if (!cache_type)
+ break;
+ do_cpuid_1_ent(&entry[index], function, index);
+ entry[index].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0xb: {
+ int index, level_type;
+
+ entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ /* read more entries until level_type is zero */
+ for (index = 1; *nent < maxnent; ++index) {
+ level_type = entry[index - 1].ecx & 0xff;
+ if (!level_type)
+ break;
+ do_cpuid_1_ent(&entry[index], function, index);
+ entry[index].flags |=
+ KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ ++*nent;
+ }
+ break;
+ }
+ case 0x80000000:
+ entry->eax = min(entry->eax, 0x8000001a);
+ break;
+ case 0x80000001:
+ entry->edx &= kvm_supported_word1_x86_features;
+ entry->ecx &= kvm_supported_word6_x86_features;
+ break;
+ }
+ put_cpu();
+}
+
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+ struct kvm_cpuid2 *cpuid,
+ struct kvm_cpuid_entry2 __user *entries)
+{
+ struct kvm_cpuid_entry2 *cpuid_entries;
+ int limit, nent = 0, r = -E2BIG;
+ u32 func;
+
+ if (cpuid->nent < 1)
+ goto out;
+ r = -ENOMEM;
+ cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+ if (!cpuid_entries)
+ goto out;
+
+ do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[0].eax;
+ for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+ r = -E2BIG;
+ if (nent >= cpuid->nent)
+ goto out_free;
+
+ do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+ limit = cpuid_entries[nent - 1].eax;
+ for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+ do_cpuid_ent(&cpuid_entries[nent], func, 0,
+ &nent, cpuid->nent);
+ r = -EFAULT;
+ if (copy_to_user(entries, cpuid_entries,
+ nent * sizeof(struct kvm_cpuid_entry2)))
+ goto out_free;
+ cpuid->nent = nent;
+ r = 0;
+
+out_free:
+ vfree(cpuid_entries);
+out:
+ return r;
+}
+
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
goto out;
break;
}
+ case KVM_SET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_GET_CPUID2: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
case KVM_GET_MSRS:
r = msr_io(vcpu, argp, kvm_get_msr, 1);
break;
r = 0;
break;
}
+ case KVM_GET_SUPPORTED_CPUID: {
+ struct kvm_cpuid2 __user *cpuid_arg = argp;
+ struct kvm_cpuid2 cpuid;
+
+ r = -EFAULT;
+ if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+ goto out;
+ r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+ cpuid_arg->entries);
+ if (r)
+ goto out;
+
+ r = -EFAULT;
+ if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+ goto out;
+ r = 0;
+ break;
+ }
default:
;
}
}
EXPORT_SYMBOL_GPL(emulator_read_std);
-static int emulator_write_std(unsigned long addr,
- const void *val,
- unsigned int bytes,
- struct kvm_vcpu *vcpu)
-{
- pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
- return X86EMUL_UNHANDLEABLE;
-}
-
static int emulator_read_emulated(unsigned long addr,
void *val,
unsigned int bytes,
struct x86_emulate_ops emulate_ops = {
.read_std = emulator_read_std,
- .write_std = emulator_write_std,
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
if (vcpu->pio.guest_pages[i]) {
- kvm_release_page(vcpu->pio.guest_pages[i]);
+ kvm_release_page_dirty(vcpu->pio.guest_pages[i]);
vcpu->pio.guest_pages[i] = NULL;
}
}
int kvm_arch_init(void *opaque)
{
+ int r;
struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+ r = kvm_mmu_module_init();
+ if (r)
+ goto out_fail;
+
kvm_init_msr_list();
if (kvm_x86_ops) {
printk(KERN_ERR "kvm: already loaded the other module\n");
- return -EEXIST;
+ r = -EEXIST;
+ goto out;
}
if (!ops->cpu_has_kvm_support()) {
printk(KERN_ERR "kvm: no hardware support\n");
- return -EOPNOTSUPP;
+ r = -EOPNOTSUPP;
+ goto out;
}
if (ops->disabled_by_bios()) {
printk(KERN_ERR "kvm: disabled by bios\n");
- return -EOPNOTSUPP;
+ r = -EOPNOTSUPP;
+ goto out;
}
kvm_x86_ops = ops;
-
+ kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
return 0;
+
+out:
+ kvm_mmu_module_exit();
+out_fail:
+ return r;
}
void kvm_arch_exit(void)
{
kvm_x86_ops = NULL;
- }
+ kvm_mmu_module_exit();
+}
int kvm_emulate_halt(struct kvm_vcpu *vcpu)
{
}
}
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+ struct kvm_cpuid_entry2 *e = &vcpu->cpuid_entries[i];
+ int j, nent = vcpu->cpuid_nent;
+
+ e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+ /* when no next entry is found, the current entry[i] is reselected */
+ for (j = i + 1; j == i; j = (j + 1) % nent) {
+ struct kvm_cpuid_entry2 *ej = &vcpu->cpuid_entries[j];
+ if (ej->function == e->function) {
+ ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+ return j;
+ }
+ }
+ return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+ u32 function, u32 index)
+{
+ if (e->function != function)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+ return 0;
+ if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+ !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+ return 0;
+ return 1;
+}
+
void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
{
int i;
- u32 function;
- struct kvm_cpuid_entry *e, *best;
+ u32 function, index;
+ struct kvm_cpuid_entry2 *e, *best;
kvm_x86_ops->cache_regs(vcpu);
function = vcpu->regs[VCPU_REGS_RAX];
+ index = vcpu->regs[VCPU_REGS_RCX];
vcpu->regs[VCPU_REGS_RAX] = 0;
vcpu->regs[VCPU_REGS_RBX] = 0;
vcpu->regs[VCPU_REGS_RCX] = 0;
best = NULL;
for (i = 0; i < vcpu->cpuid_nent; ++i) {
e = &vcpu->cpuid_entries[i];
- if (e->function == function) {
+ if (is_matching_cpuid_entry(e, function, index)) {
+ if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+ move_to_next_stateful_cpuid_entry(vcpu, i);
best = e;
break;
}
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
unsigned int id)
{
- int r;
- struct kvm_vcpu *vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+ return kvm_x86_ops->vcpu_create(kvm, id);
+}
- if (IS_ERR(vcpu)) {
- r = -ENOMEM;
- goto fail;
- }
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+ int r;
/* We do fxsave: this must be aligned. */
BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
if (r < 0)
goto free_vcpu;
- return vcpu;
+ return 0;
free_vcpu:
kvm_x86_ops->vcpu_free(vcpu);
-fail:
- return ERR_PTR(r);
+ return r;
}
-void kvm_arch_vcpu_destory(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
kvm_free_physmem(kvm);
kfree(kvm);
}
+
+int kvm_arch_set_memory_region(struct kvm *kvm,
+ struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot old,
+ int user_alloc)
+{
+ int npages = mem->memory_size >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+
+ /*To keep backward compatibility with older userspace,
+ *x86 needs to hanlde !user_alloc case.
+ */
+ if (!user_alloc) {
+ if (npages && !old.rmap) {
+ down_write(¤t->mm->mmap_sem);
+ memslot->userspace_addr = do_mmap(NULL, 0,
+ npages * PAGE_SIZE,
+ PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS,
+ 0);
+ up_write(¤t->mm->mmap_sem);
+
+ if (IS_ERR((void *)memslot->userspace_addr))
+ return PTR_ERR((void *)memslot->userspace_addr);
+ } else {
+ if (!old.user_alloc && old.rmap) {
+ int ret;
+
+ down_write(¤t->mm->mmap_sem);
+ ret = do_munmap(current->mm, old.userspace_addr,
+ old.npages * PAGE_SIZE);
+ up_write(¤t->mm->mmap_sem);
+ if (ret < 0)
+ printk(KERN_WARNING
+ "kvm_vm_ioctl_set_memory_region: "
+ "failed to munmap memory\n");
+ }
+ }
+ }
+
+ if (!kvm->n_requested_mmu_pages) {
+ unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+ kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+ }
+
+ kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+ kvm_flush_remote_tlbs(kvm);
+
+ return 0;
+}