KVM: Enhance guest cpuid management
drivers/kvm/x86.c
index 4902b35060f5bff642b3e6938f0a3bac4b998a48..7237cb25f77d27030ffe3f160320a52346938a3b 100644
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/vmalloc.h>
 #include <linux/module.h>
+#include <linux/mman.h>
 
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
 
-#define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
+#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
+#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 struct kvm_x86_ops *kvm_x86_ops;
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-       { "pf_fixed", STAT_OFFSET(pf_fixed) },
-       { "pf_guest", STAT_OFFSET(pf_guest) },
-       { "tlb_flush", STAT_OFFSET(tlb_flush) },
-       { "invlpg", STAT_OFFSET(invlpg) },
-       { "exits", STAT_OFFSET(exits) },
-       { "io_exits", STAT_OFFSET(io_exits) },
-       { "mmio_exits", STAT_OFFSET(mmio_exits) },
-       { "signal_exits", STAT_OFFSET(signal_exits) },
-       { "irq_window", STAT_OFFSET(irq_window_exits) },
-       { "halt_exits", STAT_OFFSET(halt_exits) },
-       { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
-       { "request_irq", STAT_OFFSET(request_irq_exits) },
-       { "irq_exits", STAT_OFFSET(irq_exits) },
-       { "light_exits", STAT_OFFSET(light_exits) },
-       { "efer_reload", STAT_OFFSET(efer_reload) },
+       { "pf_fixed", VCPU_STAT(pf_fixed) },
+       { "pf_guest", VCPU_STAT(pf_guest) },
+       { "tlb_flush", VCPU_STAT(tlb_flush) },
+       { "invlpg", VCPU_STAT(invlpg) },
+       { "exits", VCPU_STAT(exits) },
+       { "io_exits", VCPU_STAT(io_exits) },
+       { "mmio_exits", VCPU_STAT(mmio_exits) },
+       { "signal_exits", VCPU_STAT(signal_exits) },
+       { "irq_window", VCPU_STAT(irq_window_exits) },
+       { "halt_exits", VCPU_STAT(halt_exits) },
+       { "halt_wakeup", VCPU_STAT(halt_wakeup) },
+       { "request_irq", VCPU_STAT(request_irq_exits) },
+       { "irq_exits", VCPU_STAT(irq_exits) },
+       { "host_state_reload", VCPU_STAT(host_state_reload) },
+       { "efer_reload", VCPU_STAT(efer_reload) },
+       { "fpu_reload", VCPU_STAT(fpu_reload) },
+       { "insn_emulation", VCPU_STAT(insn_emulation) },
+       { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
+       { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
+       { "mmu_pte_write", VM_STAT(mmu_pte_write) },
+       { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
+       { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
+       { "mmu_flooded", VM_STAT(mmu_flooded) },
+       { "mmu_recycled", VM_STAT(mmu_recycled) },
+       { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
        { NULL }
 };
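
The VM_STAT()/VCPU_STAT() macros expand to an (offset, kind) pair, so each table row now records which structure holds its counter. A sketch of what one row becomes after preprocessing, assuming the item layout implied by the macros (the struct itself is defined elsewhere in the series):

	/* Assumed shape of a debugfs stats item: */
	struct kvm_stats_debugfs_item {
		const char *name;
		int offset;
		enum kvm_stat_kind kind;	/* KVM_STAT_VM or KVM_STAT_VCPU */
	};

	/* { "pf_fixed", VCPU_STAT(pf_fixed) } expands to:
	 * { "pf_fixed", offsetof(struct kvm_vcpu, stat.pf_fixed), KVM_STAT_VCPU }
	 * { "mmu_flooded", VM_STAT(mmu_flooded) } expands to:
	 * { "mmu_flooded", offsetof(struct kvm, stat.mmu_flooded), KVM_STAT_VM }
	 */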
 
@@ -154,6 +166,26 @@ out:
        return ret;
 }
 
+static bool pdptrs_changed(struct kvm_vcpu *vcpu)
+{
+       u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
+       bool changed = true;
+       int r;
+
+       if (is_long_mode(vcpu) || !is_pae(vcpu))
+               return false;
+
+       mutex_lock(&vcpu->kvm->lock);
+       r = kvm_read_guest(vcpu->kvm, vcpu->cr3 & ~31u, pdpte, sizeof(pdpte));
+       if (r < 0)
+               goto out;
+       changed = memcmp(pdpte, vcpu->pdptrs, sizeof(pdpte)) != 0;
+out:
+       mutex_unlock(&vcpu->kvm->lock);
+
+       return changed;
+}
+
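
For reference, in PAE mode CR3 holds the address of a 32-byte-aligned table of four 8-byte PDPTEs, which is why the read above masks cr3 with ~31u and compares all four cached values:

	/* PAE paging layout assumed by pdptrs_changed():
	 *
	 *   CR3 & ~31 --> +------------------+  (32-byte aligned)
	 *                 | PDPTE0, 8 bytes  |  maps GVA 0G..1G
	 *                 | PDPTE1, 8 bytes  |  maps GVA 1G..2G
	 *                 | PDPTE2, 8 bytes  |  maps GVA 2G..3G
	 *                 | PDPTE3, 8 bytes  |  maps GVA 3G..4G
	 *                 +------------------+
	 */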
 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
        if (cr0 & CR0_RESERVED_BITS) {
@@ -259,6 +291,11 @@ EXPORT_SYMBOL_GPL(set_cr4);
 
 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+       if (cr3 == vcpu->cr3 && !pdptrs_changed(vcpu)) {
+               kvm_mmu_flush_tlb(vcpu);
+               return;
+       }
+
        if (is_long_mode(vcpu)) {
                if (cr3 & CR3_L_MODE_RESERVED_BITS) {
                        printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
@@ -599,6 +636,27 @@ void decache_vcpus_on_cpu(int cpu)
        spin_unlock(&kvm_lock);
 }
 
+int kvm_dev_ioctl_check_extension(long ext)
+{
+       int r;
+
+       switch (ext) {
+       case KVM_CAP_IRQCHIP:
+       case KVM_CAP_HLT:
+       case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
+       case KVM_CAP_USER_MEMORY:
+       case KVM_CAP_SET_TSS_ADDR:
+       case KVM_CAP_EXT_CPUID:
+               r = 1;
+               break;
+       default:
+               r = 0;
+               break;
+       }
+       return r;
+}
+
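
Userspace probes these bits with the KVM_CHECK_EXTENSION ioctl on the /dev/kvm fd; a positive return value means the capability is present. A minimal sketch (error handling elided):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm_fd = open("/dev/kvm", O_RDWR);
		int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_EXT_CPUID);

		printf("KVM_CAP_EXT_CPUID: %s\n", r > 0 ? "available" : "absent");
		return 0;
	}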
 long kvm_arch_dev_ioctl(struct file *filp,
                        unsigned int ioctl, unsigned long arg)
 {
@@ -648,15 +706,22 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
        kvm_x86_ops->vcpu_put(vcpu);
+       kvm_put_guest_fpu(vcpu);
 }
 
-static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+static int is_efer_nx(void)
 {
        u64 efer;
-       int i;
-       struct kvm_cpuid_entry *e, *entry;
 
        rdmsrl(MSR_EFER, efer);
+       return efer & EFER_NX;
+}
+
+static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
+{
+       int i;
+       struct kvm_cpuid_entry2 *e, *entry;
+
        entry = NULL;
        for (i = 0; i < vcpu->cpuid_nent; ++i) {
                e = &vcpu->cpuid_entries[i];
@@ -665,15 +730,56 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
                        break;
                }
        }
-       if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
+       if (entry && (entry->edx & (1 << 20)) && !is_efer_nx()) {
                entry->edx &= ~(1 << 20);
                printk(KERN_INFO "kvm: guest NX capability removed\n");
        }
 }
 
+/* Legacy path: convert v1 kvm_cpuid_entry entries from an old userspace. */
 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                                    struct kvm_cpuid *cpuid,
                                    struct kvm_cpuid_entry __user *entries)
+{
+       int r, i;
+       struct kvm_cpuid_entry *cpuid_entries;
+
+       r = -E2BIG;
+       if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
+       r = -EFAULT;
+       if (copy_from_user(cpuid_entries, entries,
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+               goto out_free;
+       for (i = 0; i < cpuid->nent; i++) {
+               vcpu->cpuid_entries[i].function = cpuid_entries[i].function;
+               vcpu->cpuid_entries[i].eax = cpuid_entries[i].eax;
+               vcpu->cpuid_entries[i].ebx = cpuid_entries[i].ebx;
+               vcpu->cpuid_entries[i].ecx = cpuid_entries[i].ecx;
+               vcpu->cpuid_entries[i].edx = cpuid_entries[i].edx;
+               vcpu->cpuid_entries[i].index = 0;
+               vcpu->cpuid_entries[i].flags = 0;
+               vcpu->cpuid_entries[i].padding[0] = 0;
+               vcpu->cpuid_entries[i].padding[1] = 0;
+               vcpu->cpuid_entries[i].padding[2] = 0;
+       }
+       vcpu->cpuid_nent = cpuid->nent;
+       cpuid_fix_nx_cap(vcpu);
+       r = 0;
+
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
+}
+
+static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
 {
        int r;
 
@@ -682,16 +788,198 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
                goto out;
        r = -EFAULT;
        if (copy_from_user(&vcpu->cpuid_entries, entries,
-                          cpuid->nent * sizeof(struct kvm_cpuid_entry)))
+                          cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
                goto out;
        vcpu->cpuid_nent = cpuid->nent;
-       cpuid_fix_nx_cap(vcpu);
        return 0;
 
 out:
        return r;
 }
 
+static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
+{
+       int r;
+
+       r = -E2BIG;
+       if (cpuid->nent < vcpu->cpuid_nent)
+               goto out;
+       r = -EFAULT;
+       if (copy_to_user(entries, &vcpu->cpuid_entries,
+                          vcpu->cpuid_nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out;
+       return 0;
+
+out:
+       cpuid->nent = vcpu->cpuid_nent;
+       return r;
+}
+
+static inline u32 bit(int bitno)
+{
+       return 1 << (bitno & 31);
+}
+
+static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                         u32 index)
+{
+       entry->function = function;
+       entry->index = index;
+       cpuid_count(entry->function, entry->index,
+               &entry->eax, &entry->ebx, &entry->ecx, &entry->edx);
+       entry->flags = 0;
+}
+
+static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
+                        u32 index, int *nent, int maxnent)
+{
+       const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
+               bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
+               bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
+       const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
+               bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
+               bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
+               bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+               bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
+               bit(X86_FEATURE_PGE) |
+               bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+               bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
+               bit(X86_FEATURE_SYSCALL) |
+               (is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+#ifdef CONFIG_X86_64
+               bit(X86_FEATURE_LM) |
+#endif
+               bit(X86_FEATURE_MMXEXT) |
+               bit(X86_FEATURE_3DNOWEXT) |
+               bit(X86_FEATURE_3DNOW);
+       const u32 kvm_supported_word3_x86_features =
+               bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+       const u32 kvm_supported_word6_x86_features =
+               bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY);
+
+       /* All function-2 cpuid_count() calls must run on the same CPU. */
+       get_cpu();
+       do_cpuid_1_ent(entry, function, index);
+       ++*nent;
+
+       switch (function) {
+       case 0:
+               entry->eax = min(entry->eax, (u32)0xb);
+               break;
+       case 1:
+               entry->edx &= kvm_supported_word0_x86_features;
+               entry->ecx &= kvm_supported_word3_x86_features;
+               break;
+       /* function 2 entries are STATEFUL. That is, repeated cpuid commands
+        * may return different values. This forces us to get_cpu() before
+        * issuing the first command, and also to emulate this annoying behavior
+        * in kvm_emulate_cpuid() using KVM_CPUID_FLAG_STATE_READ_NEXT */
+       case 2: {
+               int t, times = entry->eax & 0xff;
+
+               entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+               entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+               for (t = 1; t < times && *nent < maxnent; ++t) {
+                       do_cpuid_1_ent(&entry[t], function, 0);
+                       entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+                       ++*nent;
+               }
+               break;
+       }
+       /* function 4 and 0xb have additional index. */
+       case 4: {
+               int index, cache_type;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* read more entries until cache_type is zero */
+               for (index = 1; *nent < maxnent; ++index) {
+                       cache_type = entry[index - 1].eax & 0x1f;
+                       if (!cache_type)
+                               break;
+                       do_cpuid_1_ent(&entry[index], function, index);
+                       entry[index].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
+       }
+       case 0xb: {
+               int index, level_type;
+
+               entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+               /* read more entries until level_type is zero */
+               for (index = 1; *nent < maxnent; ++index) {
+                       level_type = entry[index - 1].ecx & 0xff;
+                       if (!level_type)
+                               break;
+                       do_cpuid_1_ent(&entry[index], function, index);
+                       entry[index].flags |=
+                              KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+                       ++*nent;
+               }
+               break;
+       }
+       case 0x80000000:
+               entry->eax = min(entry->eax, 0x8000001a);
+               break;
+       case 0x80000001:
+               entry->edx &= kvm_supported_word1_x86_features;
+               entry->ecx &= kvm_supported_word6_x86_features;
+               break;
+       }
+       put_cpu();
+}
+
+static int kvm_vm_ioctl_get_supported_cpuid(struct kvm *kvm,
+                                   struct kvm_cpuid2 *cpuid,
+                                   struct kvm_cpuid_entry2 __user *entries)
+{
+       struct kvm_cpuid_entry2 *cpuid_entries;
+       int limit, nent = 0, r = -E2BIG;
+       u32 func;
+
+       if (cpuid->nent < 1)
+               goto out;
+       r = -ENOMEM;
+       cpuid_entries = vmalloc(sizeof(struct kvm_cpuid_entry2) * cpuid->nent);
+       if (!cpuid_entries)
+               goto out;
+
+       do_cpuid_ent(&cpuid_entries[0], 0, 0, &nent, cpuid->nent);
+       limit = cpuid_entries[0].eax;
+       for (func = 1; func <= limit && nent < cpuid->nent; ++func)
+               do_cpuid_ent(&cpuid_entries[nent], func, 0,
+                               &nent, cpuid->nent);
+       r = -E2BIG;
+       if (nent >= cpuid->nent)
+               goto out_free;
+
+       do_cpuid_ent(&cpuid_entries[nent], 0x80000000, 0, &nent, cpuid->nent);
+       limit = cpuid_entries[nent - 1].eax;
+       for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func)
+               do_cpuid_ent(&cpuid_entries[nent], func, 0,
+                              &nent, cpuid->nent);
+       r = -EFAULT;
+       if (copy_to_user(entries, cpuid_entries,
+                       nent * sizeof(struct kvm_cpuid_entry2)))
+               goto out_free;
+       cpuid->nent = nent;
+       r = 0;
+
+out_free:
+       vfree(cpuid_entries);
+out:
+       return r;
+}
+
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
                                    struct kvm_lapic_state *s)
 {
@@ -758,6 +1046,36 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
                        goto out;
                break;
        }
+       case KVM_SET_CPUID2: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
+                               cpuid_arg->entries);
+               if (r)
+                       goto out;
+               break;
+       }
+       case KVM_GET_CPUID2: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
+                               cpuid_arg->entries);
+               if (r)
+                       goto out;
+               r = -EFAULT;
+               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+                       goto out;
+               r = 0;
+               break;
+       }
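
A sketch of the userspace side of KVM_SET_CPUID2: struct kvm_cpuid2 ends in a flexible entries[] array, so the buffer must be sized for the entry count (vcpu_fd is assumed to come from KVM_CREATE_VCPU):

	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Hand a single (illustrative) leaf to the vcpu. */
	int set_one_cpuid_leaf(int vcpu_fd)
	{
		struct kvm_cpuid2 *cpuid;
		int r;

		cpuid = calloc(1, sizeof(*cpuid) + sizeof(struct kvm_cpuid_entry2));
		if (!cpuid)
			return -1;
		cpuid->nent = 1;
		cpuid->entries[0].function = 0;	/* leaf 0: max standard leaf */
		cpuid->entries[0].eax = 1;	/* expose one standard leaf */

		r = ioctl(vcpu_fd, KVM_SET_CPUID2, cpuid);
		free(cpuid);
		return r;
	}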
        case KVM_GET_MSRS:
                r = msr_io(vcpu, argp, kvm_get_msr, 1);
                break;
@@ -906,6 +1224,37 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
        return r;
 }
 
+/*
+ * Get (and clear) the dirty memory log for a memory slot.
+ */
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+                                     struct kvm_dirty_log *log)
+{
+       int r;
+       int n;
+       struct kvm_memory_slot *memslot;
+       int is_dirty = 0;
+
+       mutex_lock(&kvm->lock);
+
+       r = kvm_get_dirty_log(kvm, log, &is_dirty);
+       if (r)
+               goto out;
+
+       /* If nothing is dirty, don't bother messing with page tables. */
+       if (is_dirty) {
+               kvm_mmu_slot_remove_write_access(kvm, log->slot);
+               kvm_flush_remote_tlbs(kvm);
+               memslot = &kvm->memslots[log->slot];
+               n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
+               memset(memslot->dirty_bitmap, 0, n);
+       }
+       r = 0;
+out:
+       mutex_unlock(&kvm->lock);
+       return r;
+}
+
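
On the userspace side the caller owns the bitmap buffer, one bit per page in the slot. A minimal sketch (slot 0 and a known page count are assumed):

	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Fetch-and-clear the dirty bitmap for slot 0 of a VM fd. */
	int read_dirty_log(int vm_fd, unsigned long npages)
	{
		struct kvm_dirty_log log;
		size_t bytes = ((npages + 63) / 64) * 8;	/* u64-aligned */
		int r;

		memset(&log, 0, sizeof(log));
		log.slot = 0;
		log.dirty_bitmap = calloc(1, bytes);
		if (!log.dirty_bitmap)
			return -1;

		r = ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
		/* each set bit marks a page dirtied since the last call */
		free(log.dirty_bitmap);
		return r;
	}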
 long kvm_arch_vm_ioctl(struct file *filp,
                       unsigned int ioctl, unsigned long arg)
 {
@@ -1022,6 +1371,24 @@ long kvm_arch_vm_ioctl(struct file *filp,
                r = 0;
                break;
        }
+       case KVM_GET_SUPPORTED_CPUID: {
+               struct kvm_cpuid2 __user *cpuid_arg = argp;
+               struct kvm_cpuid2 cpuid;
+
+               r = -EFAULT;
+               if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
+                       goto out;
+               r = kvm_vm_ioctl_get_supported_cpuid(kvm, &cpuid,
+                       cpuid_arg->entries);
+               if (r)
+                       goto out;
+
+               r = -EFAULT;
+               if (copy_to_user(cpuid_arg, &cpuid, sizeof cpuid))
+                       goto out;
+               r = 0;
+               break;
+       }
        default:
                ;
        }
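
Note that at this point in the series KVM_GET_SUPPORTED_CPUID is a VM ioctl. Since the kernel returns -E2BIG when the caller's buffer is too small, userspace typically grows the buffer and retries; a sketch:

	#include <errno.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Query the host-supported cpuid table; caller frees the result. */
	struct kvm_cpuid2 *get_supported_cpuid(int vm_fd)
	{
		int nent = 64;	/* initial guess, doubled on E2BIG */
		struct kvm_cpuid2 *cpuid;

		for (;;) {
			cpuid = calloc(1, sizeof(*cpuid) +
					  nent * sizeof(struct kvm_cpuid_entry2));
			if (!cpuid)
				return NULL;
			cpuid->nent = nent;
			if (ioctl(vm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
				return cpuid;	/* ->nent holds the real count */
			free(cpuid);
			if (errno != E2BIG)
				return NULL;
			nent *= 2;
		}
	}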
@@ -1029,7 +1396,7 @@ out:
        return r;
 }
 
-static __init void kvm_init_msr_list(void)
+static void kvm_init_msr_list(void)
 {
        u32 dummy[2];
        unsigned i, j;
@@ -1100,15 +1467,6 @@ int emulator_read_std(unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(emulator_read_std);
 
-static int emulator_write_std(unsigned long addr,
-                             const void *val,
-                             unsigned int bytes,
-                             struct kvm_vcpu *vcpu)
-{
-       pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
-       return X86EMUL_UNHANDLEABLE;
-}
-
 static int emulator_read_emulated(unsigned long addr,
                                  void *val,
                                  unsigned int bytes,
@@ -1305,7 +1663,6 @@ EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
 struct x86_emulate_ops emulate_ops = {
        .read_std            = emulator_read_std,
-       .write_std           = emulator_write_std,
        .read_emulated       = emulator_read_emulated,
        .write_emulated      = emulator_write_emulated,
        .cmpxchg_emulated    = emulator_cmpxchg_emulated,
@@ -1360,7 +1717,9 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
                                        get_segment_base(vcpu, VCPU_SREG_FS);
 
                r = x86_decode_insn(&vcpu->emulate_ctxt, &emulate_ops);
+               ++vcpu->stat.insn_emulation;
                if (r)  {
+                       ++vcpu->stat.insn_emulation_fail;
                        if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
                                return EMULATE_DONE;
                        return EMULATE_FAIL;
@@ -1408,7 +1767,7 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
 
        for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
                if (vcpu->pio.guest_pages[i]) {
-                       kvm_release_page(vcpu->pio.guest_pages[i]);
+                       kvm_release_page_dirty(vcpu->pio.guest_pages[i]);
                        vcpu->pio.guest_pages[i] = NULL;
                }
 }
@@ -1645,9 +2004,48 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 }
 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
 
-__init void kvm_arch_init(void)
+int kvm_arch_init(void *opaque)
 {
+       int r;
+       struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
+
+       r = kvm_mmu_module_init();
+       if (r)
+               goto out_fail;
+
        kvm_init_msr_list();
+
+       if (kvm_x86_ops) {
+               printk(KERN_ERR "kvm: already loaded the other module\n");
+               r = -EEXIST;
+               goto out;
+       }
+
+       if (!ops->cpu_has_kvm_support()) {
+               printk(KERN_ERR "kvm: no hardware support\n");
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+       if (ops->disabled_by_bios()) {
+               printk(KERN_ERR "kvm: disabled by bios\n");
+               r = -EOPNOTSUPP;
+               goto out;
+       }
+
+       kvm_x86_ops = ops;
+       kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
+       return 0;
+
+out:
+       kvm_mmu_module_exit();
+out_fail:
+       return r;
+}
+
+void kvm_arch_exit(void)
+{
+       kvm_x86_ops = NULL;
+       kvm_mmu_module_exit();
 }
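
kvm_arch_init() now receives the vendor ops as an opaque pointer handed down at module load time. A hedged sketch of the assumed registration shape in a vendor module such as vmx.ko (callback names illustrative, not taken from this diff):

	/* Sketch, assumed shape: vendor module registration. */
	static struct kvm_x86_ops vmx_x86_ops = {
		.cpu_has_kvm_support	= vmx_cpu_has_kvm_support,	/* illustrative */
		.disabled_by_bios	= vmx_disabled_by_bios,		/* illustrative */
		/* ... remaining callbacks ... */
	};

	static int __init vmx_module_init(void)
	{
		/* kvm_init() forwards the ops to kvm_arch_init() above */
		return kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
	}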
 
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
@@ -1788,14 +2186,47 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
        }
 }
 
+static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
+{
+       struct kvm_cpuid_entry2 *e = &vcpu->cpuid_entries[i];
+       int j, nent = vcpu->cpuid_nent;
+
+       e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
+       /* when no next entry is found, the current entry[i] is reselected */
+       for (j = (i + 1) % nent; ; j = (j + 1) % nent) {
+               struct kvm_cpuid_entry2 *ej = &vcpu->cpuid_entries[j];
+               if (ej->function == e->function) {
+                       ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
+                       return j;
+               }
+       }
+       return 0; /* silence gcc, even though control never reaches here */
+}
+
+/* find an entry with matching function, matching index (if needed), and that
+ * should be read next (if it's stateful) */
+static int is_matching_cpuid_entry(struct kvm_cpuid_entry2 *e,
+       u32 function, u32 index)
+{
+       if (e->function != function)
+               return 0;
+       if ((e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX) && e->index != index)
+               return 0;
+       if ((e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC) &&
+               !(e->flags & KVM_CPUID_FLAG_STATE_READ_NEXT))
+               return 0;
+       return 1;
+}
+
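
A worked trace of the stateful protocol, assuming do_cpuid_ent() produced three function-2 entries at indices 3..5 (the first carrying STATE_READ_NEXT):

	/*
	 * entries[3]  f=2  STATEFUL_FUNC | STATE_READ_NEXT
	 * entries[4]  f=2  STATEFUL_FUNC
	 * entries[5]  f=2  STATEFUL_FUNC
	 *
	 * 1st CPUID(2): is_matching_cpuid_entry() selects [3];
	 *               move_to_next_stateful_cpuid_entry() clears the
	 *               READ_NEXT flag on [3] and sets it on [4].
	 * 2nd CPUID(2): selects [4]; the flag moves to [5].
	 * 3rd CPUID(2): selects [5]; the scan wraps around and the flag
	 *               returns to [3], so a 4th read restarts the cycle,
	 *               mimicking real hardware's rotating leaf 2.
	 */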
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
        int i;
-       u32 function;
-       struct kvm_cpuid_entry *e, *best;
+       u32 function, index;
+       struct kvm_cpuid_entry2 *e, *best;
 
        kvm_x86_ops->cache_regs(vcpu);
        function = vcpu->regs[VCPU_REGS_RAX];
+       index = vcpu->regs[VCPU_REGS_RCX];
        vcpu->regs[VCPU_REGS_RAX] = 0;
        vcpu->regs[VCPU_REGS_RBX] = 0;
        vcpu->regs[VCPU_REGS_RCX] = 0;
@@ -1803,7 +2234,9 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
        best = NULL;
        for (i = 0; i < vcpu->cpuid_nent; ++i) {
                e = &vcpu->cpuid_entries[i];
-               if (e->function == function) {
+               if (is_matching_cpuid_entry(e, function, index)) {
+                       if (e->flags & KVM_CPUID_FLAG_STATEFUL_FUNC)
+                               move_to_next_stateful_cpuid_entry(vcpu, i);
                        best = e;
                        break;
                }
@@ -1943,10 +2376,8 @@ again:
                        ++vcpu->stat.request_irq_exits;
                        goto out;
                }
-               if (!need_resched()) {
-                       ++vcpu->stat.light_exits;
+               if (!need_resched())
                        goto again;
-               }
        }
 
 out:
@@ -2273,6 +2704,28 @@ struct fxsave {
 #endif
 };
 
+/*
+ * Translate a guest virtual address to a guest physical address.
+ */
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+                                   struct kvm_translation *tr)
+{
+       unsigned long vaddr = tr->linear_address;
+       gpa_t gpa;
+
+       vcpu_load(vcpu);
+       mutex_lock(&vcpu->kvm->lock);
+       gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
+       tr->physical_address = gpa;
+       tr->valid = gpa != UNMAPPED_GVA;
+       tr->writeable = 1;
+       tr->usermode = 0;
+       mutex_unlock(&vcpu->kvm->lock);
+       vcpu_put(vcpu);
+
+       return 0;
+}
+
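
The matching userspace call is the existing KVM_TRANSLATE vcpu ioctl; a minimal sketch:

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	/* Translate a guest-virtual address through the vcpu's MMU. */
	int translate_gva(int vcpu_fd, unsigned long gva, unsigned long *gpa)
	{
		struct kvm_translation tr;

		memset(&tr, 0, sizeof(tr));
		tr.linear_address = gva;
		if (ioctl(vcpu_fd, KVM_TRANSLATE, &tr) < 0 || !tr.valid)
			return -1;
		*gpa = tr.physical_address;
		return 0;
	}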
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
        struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
@@ -2352,6 +2805,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
        vcpu->guest_fpu_loaded = 0;
        fx_save(&vcpu->guest_fx_image);
        fx_restore(&vcpu->host_fx_image);
+       ++vcpu->stat.fpu_reload;
 }
 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
 
@@ -2363,13 +2817,12 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
                                                unsigned int id)
 {
-       int r;
-       struct kvm_vcpu *vcpu = kvm_x86_ops->vcpu_create(kvm, id);
+       return kvm_x86_ops->vcpu_create(kvm, id);
+}
 
-       if (IS_ERR(vcpu)) {
-               r = -ENOMEM;
-               goto fail;
-       }
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+       int r;
 
        /* We do fxsave: this must be aligned. */
        BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
@@ -2382,14 +2835,13 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
        if (r < 0)
                goto free_vcpu;
 
-       return vcpu;
+       return 0;
 free_vcpu:
        kvm_x86_ops->vcpu_free(vcpu);
-fail:
-       return ERR_PTR(r);
+       return r;
 }
 
-void kvm_arch_vcpu_destory(struct kvm_vcpu *vcpu)
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
        vcpu_load(vcpu);
        kvm_mmu_unload(vcpu);
@@ -2476,3 +2928,100 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
        kvm_mmu_destroy(vcpu);
        free_page((unsigned long)vcpu->pio_data);
 }
+
+struct  kvm *kvm_arch_create_vm(void)
+{
+       struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
+
+       if (!kvm)
+               return ERR_PTR(-ENOMEM);
+
+       INIT_LIST_HEAD(&kvm->active_mmu_pages);
+
+       return kvm;
+}
+
+static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
+{
+       vcpu_load(vcpu);
+       kvm_mmu_unload(vcpu);
+       vcpu_put(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+       unsigned int i;
+
+       /*
+        * Unpin any mmu pages first.
+        */
+       for (i = 0; i < KVM_MAX_VCPUS; ++i)
+               if (kvm->vcpus[i])
+                       kvm_unload_vcpu_mmu(kvm->vcpus[i]);
+       for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+               if (kvm->vcpus[i]) {
+                       kvm_arch_vcpu_free(kvm->vcpus[i]);
+                       kvm->vcpus[i] = NULL;
+               }
+       }
+
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+       kfree(kvm->vpic);
+       kfree(kvm->vioapic);
+       kvm_free_vcpus(kvm);
+       kvm_free_physmem(kvm);
+       kfree(kvm);
+}
+
+int kvm_arch_set_memory_region(struct kvm *kvm,
+                               struct kvm_userspace_memory_region *mem,
+                               struct kvm_memory_slot old,
+                               int user_alloc)
+{
+       int npages = mem->memory_size >> PAGE_SHIFT;
+       struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
+
+       /* To keep backward compatibility with older userspace,
+        * x86 needs to handle the !user_alloc case.
+        */
+       if (!user_alloc) {
+               if (npages && !old.rmap) {
+                       down_write(&current->mm->mmap_sem);
+                       memslot->userspace_addr = do_mmap(NULL, 0,
+                                                    npages * PAGE_SIZE,
+                                                    PROT_READ | PROT_WRITE,
+                                                    MAP_SHARED | MAP_ANONYMOUS,
+                                                    0);
+                       up_write(&current->mm->mmap_sem);
+
+                       if (IS_ERR((void *)memslot->userspace_addr))
+                               return PTR_ERR((void *)memslot->userspace_addr);
+               } else {
+                       if (!old.user_alloc && old.rmap) {
+                               int ret;
+
+                               down_write(&current->mm->mmap_sem);
+                               ret = do_munmap(current->mm, old.userspace_addr,
+                                               old.npages * PAGE_SIZE);
+                               up_write(&current->mm->mmap_sem);
+                               if (ret < 0)
+                                       printk(KERN_WARNING
+                                      "kvm_vm_ioctl_set_memory_region: "
+                                      "failed to munmap memory\n");
+                       }
+               }
+       }
+
+       if (!kvm->n_requested_mmu_pages) {
+               unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
+               kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
+       }
+
+       kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+       kvm_flush_remote_tlbs(kvm);
+
+       return 0;
+}
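
The !user_alloc branch above exists only for old userspace; current callers take the user_alloc path by registering their own memory with KVM_SET_USER_MEMORY_REGION. A sketch:

	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>

	/* Back guest-physical address 0 with an anonymous mapping. */
	int add_user_memslot(int vm_fd, unsigned long size)
	{
		struct kvm_userspace_memory_region region;
		void *host = mmap(NULL, size, PROT_READ | PROT_WRITE,
				  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

		if (host == MAP_FAILED)
			return -1;

		region.slot = 0;
		region.flags = 0;
		region.guest_phys_addr = 0;
		region.memory_size = size;
		region.userspace_addr = (unsigned long)host;
		return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
	}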