linux-2.6: drivers/kvm/x86.c
(blob as of "KVM: Don't bother the mmu if cr3 load doesn't change cr3")
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * derived from drivers/kvm/kvm_main.c
5  *
6  * Copyright (C) 2006 Qumranet, Inc.
7  *
8  * Authors:
9  *   Avi Kivity   <avi@qumranet.com>
10  *   Yaniv Kamay  <yaniv@qumranet.com>
11  *
12  * This work is licensed under the terms of the GNU GPL, version 2.  See
13  * the COPYING file in the top-level directory.
14  *
15  */
16
17 #include "kvm.h"
18 #include "x86.h"
19 #include "x86_emulate.h"
20 #include "segment_descriptor.h"
21 #include "irq.h"
22
23 #include <linux/kvm.h>
24 #include <linux/fs.h>
25 #include <linux/vmalloc.h>
26 #include <linux/module.h>
27 #include <linux/mman.h>
28
29 #include <asm/uaccess.h>
30 #include <asm/msr.h>
31
32 #define MAX_IO_MSRS 256
33 #define CR0_RESERVED_BITS                                               \
34         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
35                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
36                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
37 #define CR4_RESERVED_BITS                                               \
38         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
39                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
40                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
41                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
42
43 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
44 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
45
46 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
47 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
48
49 struct kvm_x86_ops *kvm_x86_ops;
50
51 struct kvm_stats_debugfs_item debugfs_entries[] = {
52         { "pf_fixed", VCPU_STAT(pf_fixed) },
53         { "pf_guest", VCPU_STAT(pf_guest) },
54         { "tlb_flush", VCPU_STAT(tlb_flush) },
55         { "invlpg", VCPU_STAT(invlpg) },
56         { "exits", VCPU_STAT(exits) },
57         { "io_exits", VCPU_STAT(io_exits) },
58         { "mmio_exits", VCPU_STAT(mmio_exits) },
59         { "signal_exits", VCPU_STAT(signal_exits) },
60         { "irq_window", VCPU_STAT(irq_window_exits) },
61         { "halt_exits", VCPU_STAT(halt_exits) },
62         { "halt_wakeup", VCPU_STAT(halt_wakeup) },
63         { "request_irq", VCPU_STAT(request_irq_exits) },
64         { "irq_exits", VCPU_STAT(irq_exits) },
65         { "host_state_reload", VCPU_STAT(host_state_reload) },
66         { "efer_reload", VCPU_STAT(efer_reload) },
67         { "fpu_reload", VCPU_STAT(fpu_reload) },
68         { "insn_emulation", VCPU_STAT(insn_emulation) },
69         { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
70         { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
71         { "mmu_pte_write", VM_STAT(mmu_pte_write) },
72         { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
73         { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
74         { "mmu_flooded", VM_STAT(mmu_flooded) },
75         { "mmu_recycled", VM_STAT(mmu_recycled) },
76         { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
77         { NULL }
78 };
79
80
81 unsigned long segment_base(u16 selector)
82 {
83         struct descriptor_table gdt;
84         struct segment_descriptor *d;
85         unsigned long table_base;
86         unsigned long v;
87
88         if (selector == 0)
89                 return 0;
90
91         asm("sgdt %0" : "=m"(gdt));
92         table_base = gdt.base;
93
94         if (selector & 4) {           /* from ldt */
95                 u16 ldt_selector;
96
97                 asm("sldt %0" : "=g"(ldt_selector));
98                 table_base = segment_base(ldt_selector);
99         }
100         d = (struct segment_descriptor *)(table_base + (selector & ~7));
101         v = d->base_low | ((unsigned long)d->base_mid << 16) |
102                 ((unsigned long)d->base_high << 24);
103 #ifdef CONFIG_X86_64
104         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
105                 v |= ((unsigned long)
106                       ((struct segment_descriptor_64 *)d)->base_higher) << 32;
107 #endif
108         return v;
109 }
110 EXPORT_SYMBOL_GPL(segment_base);
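Editorial note: segment_base() resolves a host selector to a linear base address. Bit 2 of the selector (tested with selector & 4) switches the lookup from the GDT to the LDT, selector & ~7 strips the RPL and TI bits to get the byte offset of the 8-byte descriptor, and the base is reassembled from base_low (bits 0-15), base_mid (bits 16-23) and base_high (bits 24-31). On x86-64, system descriptors such as the LDT and TSS (types 2, 9 and 11 with the S bit clear) are 16 bytes wide and contribute bits 32-63 through base_higher.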
111
112 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
113 {
114         if (irqchip_in_kernel(vcpu->kvm))
115                 return vcpu->apic_base;
116         else
117                 return vcpu->apic_base;
118 }
119 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
120
121 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
122 {
123         /* TODO: reserve bits check */
124         if (irqchip_in_kernel(vcpu->kvm))
125                 kvm_lapic_set_base(vcpu, data);
126         else
127                 vcpu->apic_base = data;
128 }
129 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
130
131 static void inject_gp(struct kvm_vcpu *vcpu)
132 {
133         kvm_x86_ops->inject_gp(vcpu, 0);
134 }
135
136 /*
137  * Load the pae pdptrs.  Return true if they are all valid.
138  */
139 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
140 {
141         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
142         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
143         int i;
144         int ret;
145         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
146
147         mutex_lock(&vcpu->kvm->lock);
148         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
149                                   offset * sizeof(u64), sizeof(pdpte));
150         if (ret < 0) {
151                 ret = 0;
152                 goto out;
153         }
154         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
155                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
156                         ret = 0;
157                         goto out;
158                 }
159         }
160         ret = 1;
161
162         memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
163 out:
164         mutex_unlock(&vcpu->kvm->lock);
165
166         return ret;
167 }
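Editorial note: the offset arithmetic in load_pdptrs() selects the 32-byte slot inside the page that holds the four 8-byte PDPTEs, since in PAE mode only bits 5 and up of cr3 locate the table. A worked example with PAGE_SIZE == 4096:

    cr3                   = 0x12345ce0
    cr3 & (PAGE_SIZE-1)   = 0xce0    (offset of cr3 within its page)
    (0xce0 >> 5) << 2     = 0x19c    (offset in u64 units)
    0x19c * sizeof(u64)   = 0xce0    (byte offset passed to kvm_read_guest_page())

so the low five bits of cr3 are ignored and exactly sizeof(pdpte) == 32 bytes are read from that slot.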
168
169 static bool pdptrs_changed(struct kvm_vcpu *vcpu)
170 {
171         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
172         bool changed = true;
173         int r;
174
175         if (is_long_mode(vcpu) || !is_pae(vcpu))
176                 return false;
177
178         mutex_lock(&vcpu->kvm->lock);
179         r = kvm_read_guest(vcpu->kvm, vcpu->cr3 & ~31u, pdpte, sizeof(pdpte));
180         if (r < 0)
181                 goto out;
182         changed = memcmp(pdpte, vcpu->pdptrs, sizeof(pdpte)) != 0;
183 out:
184         mutex_unlock(&vcpu->kvm->lock);
185
186         return changed;
187 }
188
189 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
190 {
191         if (cr0 & CR0_RESERVED_BITS) {
192                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
193                        cr0, vcpu->cr0);
194                 inject_gp(vcpu);
195                 return;
196         }
197
198         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
199                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
200                 inject_gp(vcpu);
201                 return;
202         }
203
204         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
205                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
206                        "and a clear PE flag\n");
207                 inject_gp(vcpu);
208                 return;
209         }
210
211         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
212 #ifdef CONFIG_X86_64
213                 if ((vcpu->shadow_efer & EFER_LME)) {
214                         int cs_db, cs_l;
215
216                         if (!is_pae(vcpu)) {
217                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
218                                        "in long mode while PAE is disabled\n");
219                                 inject_gp(vcpu);
220                                 return;
221                         }
222                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
223                         if (cs_l) {
224                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
225                                        "in long mode while CS.L == 1\n");
226                                 inject_gp(vcpu);
227                                 return;
228
229                         }
230                 } else
231 #endif
232                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
233                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
234                                "reserved bits\n");
235                         inject_gp(vcpu);
236                         return;
237                 }
238
239         }
240
241         kvm_x86_ops->set_cr0(vcpu, cr0);
242         vcpu->cr0 = cr0;
243
244         mutex_lock(&vcpu->kvm->lock);
245         kvm_mmu_reset_context(vcpu);
246         mutex_unlock(&vcpu->kvm->lock);
247         return;
248 }
249 EXPORT_SYMBOL_GPL(set_cr0);
250
251 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
252 {
253         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
254 }
255 EXPORT_SYMBOL_GPL(lmsw);
256
257 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
258 {
259         if (cr4 & CR4_RESERVED_BITS) {
260                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
261                 inject_gp(vcpu);
262                 return;
263         }
264
265         if (is_long_mode(vcpu)) {
266                 if (!(cr4 & X86_CR4_PAE)) {
267                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
268                                "in long mode\n");
269                         inject_gp(vcpu);
270                         return;
271                 }
272         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
273                    && !load_pdptrs(vcpu, vcpu->cr3)) {
274                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
275                 inject_gp(vcpu);
276                 return;
277         }
278
279         if (cr4 & X86_CR4_VMXE) {
280                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
281                 inject_gp(vcpu);
282                 return;
283         }
284         kvm_x86_ops->set_cr4(vcpu, cr4);
285         vcpu->cr4 = cr4;
286         mutex_lock(&vcpu->kvm->lock);
287         kvm_mmu_reset_context(vcpu);
288         mutex_unlock(&vcpu->kvm->lock);
289 }
290 EXPORT_SYMBOL_GPL(set_cr4);
291
292 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
293 {
294         if (cr3 == vcpu->cr3 && !pdptrs_changed(vcpu)) {
295                 kvm_mmu_flush_tlb(vcpu);
296                 return;
297         }
298
299         if (is_long_mode(vcpu)) {
300                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
301                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
302                         inject_gp(vcpu);
303                         return;
304                 }
305         } else {
306                 if (is_pae(vcpu)) {
307                         if (cr3 & CR3_PAE_RESERVED_BITS) {
308                                 printk(KERN_DEBUG
309                                        "set_cr3: #GP, reserved bits\n");
310                                 inject_gp(vcpu);
311                                 return;
312                         }
313                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
314                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
315                                        "reserved bits\n");
316                                 inject_gp(vcpu);
317                                 return;
318                         }
319                 }
320                 /*
321                  * We don't check reserved bits in nonpae mode, because
322                  * this isn't enforced, and VMware depends on this.
323                  */
324         }
325
326         mutex_lock(&vcpu->kvm->lock);
327         /*
328          * Does the new cr3 value map to physical memory? (Note, we
329          * catch an invalid cr3 even in real-mode, because it would
330          * cause trouble later on when we turn on paging anyway.)
331          *
332          * A real CPU would silently accept an invalid cr3 and would
333          * attempt to use it - with largely undefined (and often hard
334          * to debug) behavior on the guest side.
335          */
336         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
337                 inject_gp(vcpu);
338         else {
339                 vcpu->cr3 = cr3;
340                 vcpu->mmu.new_cr3(vcpu);
341         }
342         mutex_unlock(&vcpu->kvm->lock);
343 }
344 EXPORT_SYMBOL_GPL(set_cr3);
345
346 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
347 {
348         if (cr8 & CR8_RESERVED_BITS) {
349                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
350                 inject_gp(vcpu);
351                 return;
352         }
353         if (irqchip_in_kernel(vcpu->kvm))
354                 kvm_lapic_set_tpr(vcpu, cr8);
355         else
356                 vcpu->cr8 = cr8;
357 }
358 EXPORT_SYMBOL_GPL(set_cr8);
359
360 unsigned long get_cr8(struct kvm_vcpu *vcpu)
361 {
362         if (irqchip_in_kernel(vcpu->kvm))
363                 return kvm_lapic_get_cr8(vcpu);
364         else
365                 return vcpu->cr8;
366 }
367 EXPORT_SYMBOL_GPL(get_cr8);
368
369 /*
370  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
371  * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
372  *
373  * This list is modified at module load time to reflect the
374  * capabilities of the host cpu.
375  */
376 static u32 msrs_to_save[] = {
377         MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
378         MSR_K6_STAR,
379 #ifdef CONFIG_X86_64
380         MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
381 #endif
382         MSR_IA32_TIME_STAMP_COUNTER,
383 };
384
385 static unsigned num_msrs_to_save;
386
387 static u32 emulated_msrs[] = {
388         MSR_IA32_MISC_ENABLE,
389 };
390
391 #ifdef CONFIG_X86_64
392
393 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
394 {
395         if (efer & EFER_RESERVED_BITS) {
396                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
397                        efer);
398                 inject_gp(vcpu);
399                 return;
400         }
401
402         if (is_paging(vcpu)
403             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
404                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
405                 inject_gp(vcpu);
406                 return;
407         }
408
409         kvm_x86_ops->set_efer(vcpu, efer);
410
411         efer &= ~EFER_LMA;
412         efer |= vcpu->shadow_efer & EFER_LMA;
413
414         vcpu->shadow_efer = efer;
415 }
416
417 #endif
418
419 /*
420  * Writes msr value into the appropriate "register".
421  * Returns 0 on success, non-0 otherwise.
422  * Assumes vcpu_load() was already called.
423  */
424 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
425 {
426         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
427 }
428
429 /*
430  * Adapt set_msr() to msr_io()'s calling convention
431  */
432 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
433 {
434         return kvm_set_msr(vcpu, index, *data);
435 }
436
437
438 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
439 {
440         switch (msr) {
441 #ifdef CONFIG_X86_64
442         case MSR_EFER:
443                 set_efer(vcpu, data);
444                 break;
445 #endif
446         case MSR_IA32_MC0_STATUS:
447                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
448                        __FUNCTION__, data);
449                 break;
450         case MSR_IA32_MCG_STATUS:
451                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
452                         __FUNCTION__, data);
453                 break;
454         case MSR_IA32_UCODE_REV:
455         case MSR_IA32_UCODE_WRITE:
456         case 0x200 ... 0x2ff: /* MTRRs */
457                 break;
458         case MSR_IA32_APICBASE:
459                 kvm_set_apic_base(vcpu, data);
460                 break;
461         case MSR_IA32_MISC_ENABLE:
462                 vcpu->ia32_misc_enable_msr = data;
463                 break;
464         default:
465                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
466                 return 1;
467         }
468         return 0;
469 }
470 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
471
472
473 /*
474  * Reads an msr value (of 'msr_index') into 'pdata'.
475  * Returns 0 on success, non-0 otherwise.
476  * Assumes vcpu_load() was already called.
477  */
478 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
479 {
480         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
481 }
482
483 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
484 {
485         u64 data;
486
487         switch (msr) {
488         case 0xc0010010: /* SYSCFG */
489         case 0xc0010015: /* HWCR */
490         case MSR_IA32_PLATFORM_ID:
491         case MSR_IA32_P5_MC_ADDR:
492         case MSR_IA32_P5_MC_TYPE:
493         case MSR_IA32_MC0_CTL:
494         case MSR_IA32_MCG_STATUS:
495         case MSR_IA32_MCG_CAP:
496         case MSR_IA32_MC0_MISC:
497         case MSR_IA32_MC0_MISC+4:
498         case MSR_IA32_MC0_MISC+8:
499         case MSR_IA32_MC0_MISC+12:
500         case MSR_IA32_MC0_MISC+16:
501         case MSR_IA32_UCODE_REV:
502         case MSR_IA32_PERF_STATUS:
503         case MSR_IA32_EBL_CR_POWERON:
504                 /* MTRR registers */
505         case 0xfe:
506         case 0x200 ... 0x2ff:
507                 data = 0;
508                 break;
509         case 0xcd: /* fsb frequency */
510                 data = 3;
511                 break;
512         case MSR_IA32_APICBASE:
513                 data = kvm_get_apic_base(vcpu);
514                 break;
515         case MSR_IA32_MISC_ENABLE:
516                 data = vcpu->ia32_misc_enable_msr;
517                 break;
518 #ifdef CONFIG_X86_64
519         case MSR_EFER:
520                 data = vcpu->shadow_efer;
521                 break;
522 #endif
523         default:
524                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
525                 return 1;
526         }
527         *pdata = data;
528         return 0;
529 }
530 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
531
532 /*
533  * Read or write a bunch of msrs. All parameters are kernel addresses.
534  *
535  * @return number of msrs set successfully.
536  */
537 static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
538                     struct kvm_msr_entry *entries,
539                     int (*do_msr)(struct kvm_vcpu *vcpu,
540                                   unsigned index, u64 *data))
541 {
542         int i;
543
544         vcpu_load(vcpu);
545
546         for (i = 0; i < msrs->nmsrs; ++i)
547                 if (do_msr(vcpu, entries[i].index, &entries[i].data))
548                         break;
549
550         vcpu_put(vcpu);
551
552         return i;
553 }
554
555 /*
556  * Read or write a bunch of msrs. Parameters are user addresses.
557  *
558  * @return number of msrs set successfully.
559  */
560 static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs,
561                   int (*do_msr)(struct kvm_vcpu *vcpu,
562                                 unsigned index, u64 *data),
563                   int writeback)
564 {
565         struct kvm_msrs msrs;
566         struct kvm_msr_entry *entries;
567         int r, n;
568         unsigned size;
569
570         r = -EFAULT;
571         if (copy_from_user(&msrs, user_msrs, sizeof msrs))
572                 goto out;
573
574         r = -E2BIG;
575         if (msrs.nmsrs >= MAX_IO_MSRS)
576                 goto out;
577
578         r = -ENOMEM;
579         size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
580         entries = vmalloc(size);
581         if (!entries)
582                 goto out;
583
584         r = -EFAULT;
585         if (copy_from_user(entries, user_msrs->entries, size))
586                 goto out_free;
587
588         r = n = __msr_io(vcpu, &msrs, entries, do_msr);
589         if (r < 0)
590                 goto out_free;
591
592         r = -EFAULT;
593         if (writeback && copy_to_user(user_msrs->entries, entries, size))
594                 goto out_free;
595
596         r = n;
597
598 out_free:
599         vfree(entries);
600 out:
601         return r;
602 }
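For reference, a minimal userspace sketch of the path above: the KVM_GET_MSRS vcpu ioctl carries a struct kvm_msrs header followed by kvm_msr_entry slots and returns the number of MSRs processed (the value propagated from __msr_io()). This is an editorial illustration assuming the <linux/kvm.h> uapi of this era, not code from this file.

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Illustrative only: read a single MSR from a vcpu fd obtained elsewhere. */
static int read_one_msr(int vcpu_fd, __u32 index, __u64 *value)
{
        struct kvm_msrs *msrs;
        int processed;

        msrs = calloc(1, sizeof(*msrs) + sizeof(struct kvm_msr_entry));
        if (!msrs)
                return -1;
        msrs->nmsrs = 1;
        msrs->entries[0].index = index;

        /* Return value is the count of MSRs handled, mirroring __msr_io(). */
        processed = ioctl(vcpu_fd, KVM_GET_MSRS, msrs);
        if (processed == 1)
                *value = msrs->entries[0].data;
        free(msrs);
        return processed == 1 ? 0 : -1;
}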
603
604 /*
605  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
606  * cached on it.
607  */
608 void decache_vcpus_on_cpu(int cpu)
609 {
610         struct kvm *vm;
611         struct kvm_vcpu *vcpu;
612         int i;
613
614         spin_lock(&kvm_lock);
615         list_for_each_entry(vm, &vm_list, vm_list)
616                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
617                         vcpu = vm->vcpus[i];
618                         if (!vcpu)
619                                 continue;
620                         /*
621                          * If the vcpu is locked, then it is running on some
622                          * other cpu and therefore it is not cached on the
623                          * cpu in question.
624                          *
625                          * If it's not locked, check the last cpu it executed
626                          * on.
627                          */
628                         if (mutex_trylock(&vcpu->mutex)) {
629                                 if (vcpu->cpu == cpu) {
630                                         kvm_x86_ops->vcpu_decache(vcpu);
631                                         vcpu->cpu = -1;
632                                 }
633                                 mutex_unlock(&vcpu->mutex);
634                         }
635                 }
636         spin_unlock(&kvm_lock);
637 }
638
639 int kvm_dev_ioctl_check_extension(long ext)
640 {
641         int r;
642
643         switch (ext) {
644         case KVM_CAP_IRQCHIP:
645         case KVM_CAP_HLT:
646         case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
647         case KVM_CAP_USER_MEMORY:
648         case KVM_CAP_SET_TSS_ADDR:
649                 r = 1;
650                 break;
651         default:
652                 r = 0;
653                 break;
654         }
655         return r;
656
657 }
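kvm_dev_ioctl_check_extension() is what backs the KVM_CHECK_EXTENSION ioctl on /dev/kvm, so a VMM can probe these capabilities before relying on them. A minimal illustrative sketch (not from this file; error handling omitted):

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: returns nonzero if the in-kernel irqchip is available. */
static int have_in_kernel_irqchip(void)
{
        int kvm_fd = open("/dev/kvm", O_RDWR);
        int r = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IRQCHIP);

        close(kvm_fd);
        return r > 0;           /* the handler above returns 1 or 0 */
}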
658
659 long kvm_arch_dev_ioctl(struct file *filp,
660                         unsigned int ioctl, unsigned long arg)
661 {
662         void __user *argp = (void __user *)arg;
663         long r;
664
665         switch (ioctl) {
666         case KVM_GET_MSR_INDEX_LIST: {
667                 struct kvm_msr_list __user *user_msr_list = argp;
668                 struct kvm_msr_list msr_list;
669                 unsigned n;
670
671                 r = -EFAULT;
672                 if (copy_from_user(&msr_list, user_msr_list, sizeof msr_list))
673                         goto out;
674                 n = msr_list.nmsrs;
675                 msr_list.nmsrs = num_msrs_to_save + ARRAY_SIZE(emulated_msrs);
676                 if (copy_to_user(user_msr_list, &msr_list, sizeof msr_list))
677                         goto out;
678                 r = -E2BIG;
679                 if (n < num_msrs_to_save)
680                         goto out;
681                 r = -EFAULT;
682                 if (copy_to_user(user_msr_list->indices, &msrs_to_save,
683                                  num_msrs_to_save * sizeof(u32)))
684                         goto out;
685                 if (copy_to_user(user_msr_list->indices
686                                  + num_msrs_to_save,
687                                  &emulated_msrs,
688                                  ARRAY_SIZE(emulated_msrs) * sizeof(u32)))
689                         goto out;
690                 r = 0;
691                 break;
692         }
693         default:
694                 r = -EINVAL;
695         }
696 out:
697         return r;
698 }
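Because the handler above writes the real count back into nmsrs and fails with -E2BIG when the caller's buffer is too small, userspace typically issues KVM_GET_MSR_INDEX_LIST twice: once to learn the size, then again with a large enough buffer. An illustrative sketch under those assumptions (not part of this file):

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: kvm_fd is an open /dev/kvm descriptor; caller frees the result. */
static struct kvm_msr_list *get_msr_index_list(int kvm_fd)
{
        struct kvm_msr_list probe = { .nmsrs = 0 };
        struct kvm_msr_list *list;

        /* First call fails with E2BIG but fills in the required count. */
        ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, &probe);

        list = calloc(1, sizeof(*list) + probe.nmsrs * sizeof(__u32));
        if (!list)
                return NULL;
        list->nmsrs = probe.nmsrs;
        if (ioctl(kvm_fd, KVM_GET_MSR_INDEX_LIST, list) < 0) {
                free(list);
                return NULL;
        }
        return list;
}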
699
700 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
701 {
702         kvm_x86_ops->vcpu_load(vcpu, cpu);
703 }
704
705 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
706 {
707         kvm_x86_ops->vcpu_put(vcpu);
708         kvm_put_guest_fpu(vcpu);
709 }
710
711 static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
712 {
713         u64 efer;
714         int i;
715         struct kvm_cpuid_entry *e, *entry;
716
717         rdmsrl(MSR_EFER, efer);
718         entry = NULL;
719         for (i = 0; i < vcpu->cpuid_nent; ++i) {
720                 e = &vcpu->cpuid_entries[i];
721                 if (e->function == 0x80000001) {
722                         entry = e;
723                         break;
724                 }
725         }
726         if (entry && (entry->edx & (1 << 20)) && !(efer & EFER_NX)) {
727                 entry->edx &= ~(1 << 20);
728                 printk(KERN_INFO "kvm: guest NX capability removed\n");
729         }
730 }
731
732 static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
733                                     struct kvm_cpuid *cpuid,
734                                     struct kvm_cpuid_entry __user *entries)
735 {
736         int r;
737
738         r = -E2BIG;
739         if (cpuid->nent > KVM_MAX_CPUID_ENTRIES)
740                 goto out;
741         r = -EFAULT;
742         if (copy_from_user(&vcpu->cpuid_entries, entries,
743                            cpuid->nent * sizeof(struct kvm_cpuid_entry)))
744                 goto out;
745         vcpu->cpuid_nent = cpuid->nent;
746         cpuid_fix_nx_cap(vcpu);
747         return 0;
748
749 out:
750         return r;
751 }
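Userspace hands the whole CPUID table to the vcpu in one KVM_SET_CPUID call; the kernel then filters it with cpuid_fix_nx_cap() as above. The following only illustrates the calling convention, assuming the original struct kvm_cpuid / kvm_cpuid_entry layout; a real VMM copies the host's CPUID output and edits it rather than inventing leaves:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: install a single 0x80000001 leaf on a vcpu fd. */
static int set_one_cpuid_leaf(int vcpu_fd)
{
        struct kvm_cpuid *cpuid;
        int r;

        cpuid = calloc(1, sizeof(*cpuid) + sizeof(struct kvm_cpuid_entry));
        if (!cpuid)
                return -1;
        cpuid->nent = 1;
        cpuid->entries[0].function = 0x80000001;
        cpuid->entries[0].edx = (1u << 29) | (1u << 20); /* LM + NX; NX is dropped
                                                            by cpuid_fix_nx_cap()
                                                            if the host lacks it */

        r = ioctl(vcpu_fd, KVM_SET_CPUID, cpuid);
        free(cpuid);
        return r;
}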
752
753 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
754                                     struct kvm_lapic_state *s)
755 {
756         vcpu_load(vcpu);
757         memcpy(s->regs, vcpu->apic->regs, sizeof *s);
758         vcpu_put(vcpu);
759
760         return 0;
761 }
762
763 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
764                                     struct kvm_lapic_state *s)
765 {
766         vcpu_load(vcpu);
767         memcpy(vcpu->apic->regs, s->regs, sizeof *s);
768         kvm_apic_post_state_restore(vcpu);
769         vcpu_put(vcpu);
770
771         return 0;
772 }
773
774 long kvm_arch_vcpu_ioctl(struct file *filp,
775                          unsigned int ioctl, unsigned long arg)
776 {
777         struct kvm_vcpu *vcpu = filp->private_data;
778         void __user *argp = (void __user *)arg;
779         int r;
780
781         switch (ioctl) {
782         case KVM_GET_LAPIC: {
783                 struct kvm_lapic_state lapic;
784
785                 memset(&lapic, 0, sizeof lapic);
786                 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic);
787                 if (r)
788                         goto out;
789                 r = -EFAULT;
790                 if (copy_to_user(argp, &lapic, sizeof lapic))
791                         goto out;
792                 r = 0;
793                 break;
794         }
795         case KVM_SET_LAPIC: {
796                 struct kvm_lapic_state lapic;
797
798                 r = -EFAULT;
799                 if (copy_from_user(&lapic, argp, sizeof lapic))
800                         goto out;
801                 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);
802                 if (r)
803                         goto out;
804                 r = 0;
805                 break;
806         }
807         case KVM_SET_CPUID: {
808                 struct kvm_cpuid __user *cpuid_arg = argp;
809                 struct kvm_cpuid cpuid;
810
811                 r = -EFAULT;
812                 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid))
813                         goto out;
814                 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
815                 if (r)
816                         goto out;
817                 break;
818         }
819         case KVM_GET_MSRS:
820                 r = msr_io(vcpu, argp, kvm_get_msr, 1);
821                 break;
822         case KVM_SET_MSRS:
823                 r = msr_io(vcpu, argp, do_set_msr, 0);
824                 break;
825         default:
826                 r = -EINVAL;
827         }
828 out:
829         return r;
830 }
831
832 static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
833 {
834         int ret;
835
836         if (addr > (unsigned int)(-3 * PAGE_SIZE))
837                 return -1;
838         ret = kvm_x86_ops->set_tss_addr(kvm, addr);
839         return ret;
840 }
841
842 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
843                                           u32 kvm_nr_mmu_pages)
844 {
845         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
846                 return -EINVAL;
847
848         mutex_lock(&kvm->lock);
849
850         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
851         kvm->n_requested_mmu_pages = kvm_nr_mmu_pages;
852
853         mutex_unlock(&kvm->lock);
854         return 0;
855 }
856
857 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
858 {
859         return kvm->n_alloc_mmu_pages;
860 }
861
862 /*
863  * Set a new alias region.  Aliases map a portion of physical memory into
864  * another portion.  This is useful for memory windows, for example the PC
865  * VGA region.
866  */
867 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
868                                          struct kvm_memory_alias *alias)
869 {
870         int r, n;
871         struct kvm_mem_alias *p;
872
873         r = -EINVAL;
874         /* General sanity checks */
875         if (alias->memory_size & (PAGE_SIZE - 1))
876                 goto out;
877         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
878                 goto out;
879         if (alias->slot >= KVM_ALIAS_SLOTS)
880                 goto out;
881         if (alias->guest_phys_addr + alias->memory_size
882             < alias->guest_phys_addr)
883                 goto out;
884         if (alias->target_phys_addr + alias->memory_size
885             < alias->target_phys_addr)
886                 goto out;
887
888         mutex_lock(&kvm->lock);
889
890         p = &kvm->aliases[alias->slot];
891         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
892         p->npages = alias->memory_size >> PAGE_SHIFT;
893         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
894
895         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
896                 if (kvm->aliases[n - 1].npages)
897                         break;
898         kvm->naliases = n;
899
900         kvm_mmu_zap_all(kvm);
901
902         mutex_unlock(&kvm->lock);
903
904         return 0;
905
906 out:
907         return r;
908 }
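The comment above names the PC VGA window as the typical user of aliases. A hedged sketch of the corresponding VM ioctl, with made-up addresses and the struct kvm_memory_alias layout assumed from the uapi of this era (not code from this file):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: point the legacy VGA window at part of the frame buffer. */
static int alias_vga_window(int vm_fd, __u64 framebuffer_gpa)
{
        struct kvm_memory_alias alias = {
                .slot             = 0,
                .guest_phys_addr  = 0xa0000,         /* legacy VGA window */
                .memory_size      = 0x10000,         /* 64K, page aligned as checked above */
                .target_phys_addr = framebuffer_gpa, /* where the bytes really live */
        };

        return ioctl(vm_fd, KVM_SET_MEMORY_ALIAS, &alias);
}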
909
910 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
911 {
912         int r;
913
914         r = 0;
915         switch (chip->chip_id) {
916         case KVM_IRQCHIP_PIC_MASTER:
917                 memcpy(&chip->chip.pic,
918                         &pic_irqchip(kvm)->pics[0],
919                         sizeof(struct kvm_pic_state));
920                 break;
921         case KVM_IRQCHIP_PIC_SLAVE:
922                 memcpy(&chip->chip.pic,
923                         &pic_irqchip(kvm)->pics[1],
924                         sizeof(struct kvm_pic_state));
925                 break;
926         case KVM_IRQCHIP_IOAPIC:
927                 memcpy(&chip->chip.ioapic,
928                         ioapic_irqchip(kvm),
929                         sizeof(struct kvm_ioapic_state));
930                 break;
931         default:
932                 r = -EINVAL;
933                 break;
934         }
935         return r;
936 }
937
938 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
939 {
940         int r;
941
942         r = 0;
943         switch (chip->chip_id) {
944         case KVM_IRQCHIP_PIC_MASTER:
945                 memcpy(&pic_irqchip(kvm)->pics[0],
946                         &chip->chip.pic,
947                         sizeof(struct kvm_pic_state));
948                 break;
949         case KVM_IRQCHIP_PIC_SLAVE:
950                 memcpy(&pic_irqchip(kvm)->pics[1],
951                         &chip->chip.pic,
952                         sizeof(struct kvm_pic_state));
953                 break;
954         case KVM_IRQCHIP_IOAPIC:
955                 memcpy(ioapic_irqchip(kvm),
956                         &chip->chip.ioapic,
957                         sizeof(struct kvm_ioapic_state));
958                 break;
959         default:
960                 r = -EINVAL;
961                 break;
962         }
963         kvm_pic_update_irq(pic_irqchip(kvm));
964         return r;
965 }
966
967 /*
968  * Get (and clear) the dirty memory log for a memory slot.
969  */
970 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
971                                       struct kvm_dirty_log *log)
972 {
973         int r;
974         int n;
975         struct kvm_memory_slot *memslot;
976         int is_dirty = 0;
977
978         mutex_lock(&kvm->lock);
979
980         r = kvm_get_dirty_log(kvm, log, &is_dirty);
981         if (r)
982                 goto out;
983
984         /* If nothing is dirty, don't bother messing with page tables. */
985         if (is_dirty) {
986                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
987                 kvm_flush_remote_tlbs(kvm);
988                 memslot = &kvm->memslots[log->slot];
989                 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
990                 memset(memslot->dirty_bitmap, 0, n);
991         }
992         r = 0;
993 out:
994         mutex_unlock(&kvm->lock);
995         return r;
996 }
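On the userspace side, KVM_GET_DIRTY_LOG fills a caller-supplied bitmap with one bit per page of the slot, sized the same way as the ALIGN(memslot->npages, BITS_PER_LONG) / 8 expression above. An illustrative sketch, assuming npages is known from the region the VMM registered (not part of this file):

#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: returns a freshly allocated dirty bitmap for one slot. */
static unsigned long *fetch_dirty_bitmap(int vm_fd, __u32 slot, unsigned long npages)
{
        size_t len = ((npages + 63) / 64) * 8;   /* one bit per page, long-aligned */
        unsigned long *bitmap = calloc(1, len);
        struct kvm_dirty_log log;

        if (!bitmap)
                return NULL;
        memset(&log, 0, sizeof(log));
        log.slot = slot;
        log.dirty_bitmap = bitmap;

        if (ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log) < 0) {
                free(bitmap);
                return NULL;
        }
        return bitmap;                            /* caller frees */
}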
997
998 long kvm_arch_vm_ioctl(struct file *filp,
999                        unsigned int ioctl, unsigned long arg)
1000 {
1001         struct kvm *kvm = filp->private_data;
1002         void __user *argp = (void __user *)arg;
1003         int r = -EINVAL;
1004
1005         switch (ioctl) {
1006         case KVM_SET_TSS_ADDR:
1007                 r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
1008                 if (r < 0)
1009                         goto out;
1010                 break;
1011         case KVM_SET_MEMORY_REGION: {
1012                 struct kvm_memory_region kvm_mem;
1013                 struct kvm_userspace_memory_region kvm_userspace_mem;
1014
1015                 r = -EFAULT;
1016                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
1017                         goto out;
1018                 kvm_userspace_mem.slot = kvm_mem.slot;
1019                 kvm_userspace_mem.flags = kvm_mem.flags;
1020                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
1021                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
1022                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
1023                 if (r)
1024                         goto out;
1025                 break;
1026         }
1027         case KVM_SET_NR_MMU_PAGES:
1028                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
1029                 if (r)
1030                         goto out;
1031                 break;
1032         case KVM_GET_NR_MMU_PAGES:
1033                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1034                 break;
1035         case KVM_SET_MEMORY_ALIAS: {
1036                 struct kvm_memory_alias alias;
1037
1038                 r = -EFAULT;
1039                 if (copy_from_user(&alias, argp, sizeof alias))
1040                         goto out;
1041                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
1042                 if (r)
1043                         goto out;
1044                 break;
1045         }
1046         case KVM_CREATE_IRQCHIP:
1047                 r = -ENOMEM;
1048                 kvm->vpic = kvm_create_pic(kvm);
1049                 if (kvm->vpic) {
1050                         r = kvm_ioapic_init(kvm);
1051                         if (r) {
1052                                 kfree(kvm->vpic);
1053                                 kvm->vpic = NULL;
1054                                 goto out;
1055                         }
1056                 } else
1057                         goto out;
1058                 break;
1059         case KVM_IRQ_LINE: {
1060                 struct kvm_irq_level irq_event;
1061
1062                 r = -EFAULT;
1063                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
1064                         goto out;
1065                 if (irqchip_in_kernel(kvm)) {
1066                         mutex_lock(&kvm->lock);
1067                         if (irq_event.irq < 16)
1068                                 kvm_pic_set_irq(pic_irqchip(kvm),
1069                                         irq_event.irq,
1070                                         irq_event.level);
1071                         kvm_ioapic_set_irq(kvm->vioapic,
1072                                         irq_event.irq,
1073                                         irq_event.level);
1074                         mutex_unlock(&kvm->lock);
1075                         r = 0;
1076                 }
1077                 break;
1078         }
1079         case KVM_GET_IRQCHIP: {
1080                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1081                 struct kvm_irqchip chip;
1082
1083                 r = -EFAULT;
1084                 if (copy_from_user(&chip, argp, sizeof chip))
1085                         goto out;
1086                 r = -ENXIO;
1087                 if (!irqchip_in_kernel(kvm))
1088                         goto out;
1089                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
1090                 if (r)
1091                         goto out;
1092                 r = -EFAULT;
1093                 if (copy_to_user(argp, &chip, sizeof chip))
1094                         goto out;
1095                 r = 0;
1096                 break;
1097         }
1098         case KVM_SET_IRQCHIP: {
1099                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1100                 struct kvm_irqchip chip;
1101
1102                 r = -EFAULT;
1103                 if (copy_from_user(&chip, argp, sizeof chip))
1104                         goto out;
1105                 r = -ENXIO;
1106                 if (!irqchip_in_kernel(kvm))
1107                         goto out;
1108                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
1109                 if (r)
1110                         goto out;
1111                 r = 0;
1112                 break;
1113         }
1114         default:
1115                 ;
1116         }
1117 out:
1118         return r;
1119 }
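The KVM_CREATE_IRQCHIP and KVM_IRQ_LINE cases above pair up on the userspace side roughly as follows; this is an illustrative sketch only (error handling omitted, and the irqchip must be created before any vcpus run):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Illustrative only: create the in-kernel PIC/IOAPIC once per VM. */
static void setup_irqchip(int vm_fd)
{
        ioctl(vm_fd, KVM_CREATE_IRQCHIP, 0);
}

/* Illustrative only: raise and lower one interrupt line. */
static void pulse_irq(int vm_fd, __u32 gsi)
{
        struct kvm_irq_level irq = { .irq = gsi, .level = 1 };

        ioctl(vm_fd, KVM_IRQ_LINE, &irq);      /* assert   */
        irq.level = 0;
        ioctl(vm_fd, KVM_IRQ_LINE, &irq);      /* deassert */
}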
1120
1121 static void kvm_init_msr_list(void)
1122 {
1123         u32 dummy[2];
1124         unsigned i, j;
1125
1126         for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
1127                 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
1128                         continue;
1129                 if (j < i)
1130                         msrs_to_save[j] = msrs_to_save[i];
1131                 j++;
1132         }
1133         num_msrs_to_save = j;
1134 }
1135
1136 /*
1137  * Only the apic needs an MMIO device hook, so shortcut now.
1138  */
1139 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1140                                                 gpa_t addr)
1141 {
1142         struct kvm_io_device *dev;
1143
1144         if (vcpu->apic) {
1145                 dev = &vcpu->apic->dev;
1146                 if (dev->in_range(dev, addr))
1147                         return dev;
1148         }
1149         return NULL;
1150 }
1151
1152
1153 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1154                                                 gpa_t addr)
1155 {
1156         struct kvm_io_device *dev;
1157
1158         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1159         if (dev == NULL)
1160                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1161         return dev;
1162 }
1163
1164 int emulator_read_std(unsigned long addr,
1165                              void *val,
1166                              unsigned int bytes,
1167                              struct kvm_vcpu *vcpu)
1168 {
1169         void *data = val;
1170
1171         while (bytes) {
1172                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1173                 unsigned offset = addr & (PAGE_SIZE-1);
1174                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1175                 int ret;
1176
1177                 if (gpa == UNMAPPED_GVA)
1178                         return X86EMUL_PROPAGATE_FAULT;
1179                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1180                 if (ret < 0)
1181                         return X86EMUL_UNHANDLEABLE;
1182
1183                 bytes -= tocopy;
1184                 data += tocopy;
1185                 addr += tocopy;
1186         }
1187
1188         return X86EMUL_CONTINUE;
1189 }
1190 EXPORT_SYMBOL_GPL(emulator_read_std);
1191
1192 static int emulator_read_emulated(unsigned long addr,
1193                                   void *val,
1194                                   unsigned int bytes,
1195                                   struct kvm_vcpu *vcpu)
1196 {
1197         struct kvm_io_device *mmio_dev;
1198         gpa_t                 gpa;
1199
1200         if (vcpu->mmio_read_completed) {
1201                 memcpy(val, vcpu->mmio_data, bytes);
1202                 vcpu->mmio_read_completed = 0;
1203                 return X86EMUL_CONTINUE;
1204         }
1205
1206         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1207
1208         /* For APIC access vmexit */
1209         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1210                 goto mmio;
1211
1212         if (emulator_read_std(addr, val, bytes, vcpu)
1213                         == X86EMUL_CONTINUE)
1214                 return X86EMUL_CONTINUE;
1215         if (gpa == UNMAPPED_GVA)
1216                 return X86EMUL_PROPAGATE_FAULT;
1217
1218 mmio:
1219         /*
1220          * Is this MMIO handled locally?
1221          */
1222         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1223         if (mmio_dev) {
1224                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1225                 return X86EMUL_CONTINUE;
1226         }
1227
1228         vcpu->mmio_needed = 1;
1229         vcpu->mmio_phys_addr = gpa;
1230         vcpu->mmio_size = bytes;
1231         vcpu->mmio_is_write = 0;
1232
1233         return X86EMUL_UNHANDLEABLE;
1234 }
1235
1236 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1237                                const void *val, int bytes)
1238 {
1239         int ret;
1240
1241         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1242         if (ret < 0)
1243                 return 0;
1244         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1245         return 1;
1246 }
1247
1248 static int emulator_write_emulated_onepage(unsigned long addr,
1249                                            const void *val,
1250                                            unsigned int bytes,
1251                                            struct kvm_vcpu *vcpu)
1252 {
1253         struct kvm_io_device *mmio_dev;
1254         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1255
1256         if (gpa == UNMAPPED_GVA) {
1257                 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1258                 return X86EMUL_PROPAGATE_FAULT;
1259         }
1260
1261         /* For APIC access vmexit */
1262         if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
1263                 goto mmio;
1264
1265         if (emulator_write_phys(vcpu, gpa, val, bytes))
1266                 return X86EMUL_CONTINUE;
1267
1268 mmio:
1269         /*
1270          * Is this MMIO handled locally?
1271          */
1272         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1273         if (mmio_dev) {
1274                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1275                 return X86EMUL_CONTINUE;
1276         }
1277
1278         vcpu->mmio_needed = 1;
1279         vcpu->mmio_phys_addr = gpa;
1280         vcpu->mmio_size = bytes;
1281         vcpu->mmio_is_write = 1;
1282         memcpy(vcpu->mmio_data, val, bytes);
1283
1284         return X86EMUL_CONTINUE;
1285 }
1286
1287 int emulator_write_emulated(unsigned long addr,
1288                                    const void *val,
1289                                    unsigned int bytes,
1290                                    struct kvm_vcpu *vcpu)
1291 {
1292         /* Crossing a page boundary? */
1293         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1294                 int rc, now;
1295
1296                 now = -addr & ~PAGE_MASK;
1297                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1298                 if (rc != X86EMUL_CONTINUE)
1299                         return rc;
1300                 addr += now;
1301                 val += now;
1302                 bytes -= now;
1303         }
1304         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1305 }
1306 EXPORT_SYMBOL_GPL(emulator_write_emulated);
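Editorial note: in emulator_write_emulated(), now = -addr & ~PAGE_MASK is the number of bytes left before the next page boundary. With 4K pages and addr = 0x1ffe, bytes = 4:

    -0x1ffe & 0xfff = 0x002

so the first onepage call writes 2 bytes at 0x1ffe and the second writes the remaining 2 bytes at 0x2000; each half can independently take the regular-memory or the MMIO path.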
1307
1308 static int emulator_cmpxchg_emulated(unsigned long addr,
1309                                      const void *old,
1310                                      const void *new,
1311                                      unsigned int bytes,
1312                                      struct kvm_vcpu *vcpu)
1313 {
1314         static int reported;
1315
1316         if (!reported) {
1317                 reported = 1;
1318                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1319         }
1320         return emulator_write_emulated(addr, new, bytes, vcpu);
1321 }
1322
1323 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1324 {
1325         return kvm_x86_ops->get_segment_base(vcpu, seg);
1326 }
1327
1328 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1329 {
1330         return X86EMUL_CONTINUE;
1331 }
1332
1333 int emulate_clts(struct kvm_vcpu *vcpu)
1334 {
1335         kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1336         return X86EMUL_CONTINUE;
1337 }
1338
1339 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1340 {
1341         struct kvm_vcpu *vcpu = ctxt->vcpu;
1342
1343         switch (dr) {
1344         case 0 ... 3:
1345                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1346                 return X86EMUL_CONTINUE;
1347         default:
1348                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1349                 return X86EMUL_UNHANDLEABLE;
1350         }
1351 }
1352
1353 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1354 {
1355         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1356         int exception;
1357
1358         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1359         if (exception) {
1360                 /* FIXME: better handling */
1361                 return X86EMUL_UNHANDLEABLE;
1362         }
1363         return X86EMUL_CONTINUE;
1364 }
1365
1366 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1367 {
1368         static int reported;
1369         u8 opcodes[4];
1370         unsigned long rip = vcpu->rip;
1371         unsigned long rip_linear;
1372
1373         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1374
1375         if (reported)
1376                 return;
1377
1378         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1379
1380         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1381                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1382         reported = 1;
1383 }
1384 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1385
1386 struct x86_emulate_ops emulate_ops = {
1387         .read_std            = emulator_read_std,
1388         .read_emulated       = emulator_read_emulated,
1389         .write_emulated      = emulator_write_emulated,
1390         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1391 };
1392
1393 int emulate_instruction(struct kvm_vcpu *vcpu,
1394                         struct kvm_run *run,
1395                         unsigned long cr2,
1396                         u16 error_code,
1397                         int no_decode)
1398 {
1399         int r;
1400
1401         vcpu->mmio_fault_cr2 = cr2;
1402         kvm_x86_ops->cache_regs(vcpu);
1403
1404         vcpu->mmio_is_write = 0;
1405         vcpu->pio.string = 0;
1406
1407         if (!no_decode) {
1408                 int cs_db, cs_l;
1409                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1410
1411                 vcpu->emulate_ctxt.vcpu = vcpu;
1412                 vcpu->emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1413                 vcpu->emulate_ctxt.cr2 = cr2;
1414                 vcpu->emulate_ctxt.mode =
1415                         (vcpu->emulate_ctxt.eflags & X86_EFLAGS_VM)
1416                         ? X86EMUL_MODE_REAL : cs_l
1417                         ? X86EMUL_MODE_PROT64 : cs_db
1418                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1419
1420                 if (vcpu->emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1421                         vcpu->emulate_ctxt.cs_base = 0;
1422                         vcpu->emulate_ctxt.ds_base = 0;
1423                         vcpu->emulate_ctxt.es_base = 0;
1424                         vcpu->emulate_ctxt.ss_base = 0;
1425                 } else {
1426                         vcpu->emulate_ctxt.cs_base =
1427                                         get_segment_base(vcpu, VCPU_SREG_CS);
1428                         vcpu->emulate_ctxt.ds_base =
1429                                         get_segment_base(vcpu, VCPU_SREG_DS);
1430                         vcpu->emulate_ctxt.es_base =
1431                                         get_segment_base(vcpu, VCPU_SREG_ES);
1432                         vcpu->emulate_ctxt.ss_base =
1433                                         get_segment_base(vcpu, VCPU_SREG_SS);
1434                 }
1435
1436                 vcpu->emulate_ctxt.gs_base =
1437                                         get_segment_base(vcpu, VCPU_SREG_GS);
1438                 vcpu->emulate_ctxt.fs_base =
1439                                         get_segment_base(vcpu, VCPU_SREG_FS);
1440
1441                 r = x86_decode_insn(&vcpu->emulate_ctxt, &emulate_ops);
1442                 ++vcpu->stat.insn_emulation;
1443                 if (r)  {
1444                         ++vcpu->stat.insn_emulation_fail;
1445                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1446                                 return EMULATE_DONE;
1447                         return EMULATE_FAIL;
1448                 }
1449         }
1450
1451         r = x86_emulate_insn(&vcpu->emulate_ctxt, &emulate_ops);
1452
1453         if (vcpu->pio.string)
1454                 return EMULATE_DO_MMIO;
1455
1456         if ((r || vcpu->mmio_is_write) && run) {
1457                 run->exit_reason = KVM_EXIT_MMIO;
1458                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1459                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1460                 run->mmio.len = vcpu->mmio_size;
1461                 run->mmio.is_write = vcpu->mmio_is_write;
1462         }
1463
1464         if (r) {
1465                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1466                         return EMULATE_DONE;
1467                 if (!vcpu->mmio_needed) {
1468                         kvm_report_emulation_failure(vcpu, "mmio");
1469                         return EMULATE_FAIL;
1470                 }
1471                 return EMULATE_DO_MMIO;
1472         }
1473
1474         kvm_x86_ops->decache_regs(vcpu);
1475         kvm_x86_ops->set_rflags(vcpu, vcpu->emulate_ctxt.eflags);
1476
1477         if (vcpu->mmio_is_write) {
1478                 vcpu->mmio_needed = 0;
1479                 return EMULATE_DO_MMIO;
1480         }
1481
1482         return EMULATE_DONE;
1483 }
1484 EXPORT_SYMBOL_GPL(emulate_instruction);
1485
1486 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
1487 {
1488         int i;
1489
1490         for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
1491                 if (vcpu->pio.guest_pages[i]) {
1492                         kvm_release_page_dirty(vcpu->pio.guest_pages[i]);
1493                         vcpu->pio.guest_pages[i] = NULL;
1494                 }
1495 }
1496
1497 static int pio_copy_data(struct kvm_vcpu *vcpu)
1498 {
1499         void *p = vcpu->pio_data;
1500         void *q;
1501         unsigned bytes;
1502         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1503
1504         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1505                  PAGE_KERNEL);
1506         if (!q) {
1507                 free_pio_guest_pages(vcpu);
1508                 return -ENOMEM;
1509         }
1510         q += vcpu->pio.guest_page_offset;
1511         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1512         if (vcpu->pio.in)
1513                 memcpy(q, p, bytes);
1514         else
1515                 memcpy(p, q, bytes);
1516         q -= vcpu->pio.guest_page_offset;
1517         vunmap(q);
1518         free_pio_guest_pages(vcpu);
1519         return 0;
1520 }
1521
1522 int complete_pio(struct kvm_vcpu *vcpu)
1523 {
1524         struct kvm_pio_request *io = &vcpu->pio;
1525         long delta;
1526         int r;
1527
1528         kvm_x86_ops->cache_regs(vcpu);
1529
1530         if (!io->string) {
1531                 if (io->in)
1532                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1533                                io->size);
1534         } else {
1535                 if (io->in) {
1536                         r = pio_copy_data(vcpu);
1537                         if (r) {
1538                                 kvm_x86_ops->cache_regs(vcpu);
1539                                 return r;
1540                         }
1541                 }
1542
1543                 delta = 1;
1544                 if (io->rep) {
1545                         delta *= io->cur_count;
1546                         /*
1547                          * The size of the register should really depend on
1548                          * current address size.
1549                          */
1550                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1551                 }
1552                 if (io->down)
1553                         delta = -delta;
1554                 delta *= io->size;
1555                 if (io->in)
1556                         vcpu->regs[VCPU_REGS_RDI] += delta;
1557                 else
1558                         vcpu->regs[VCPU_REGS_RSI] += delta;
1559         }
1560
1561         kvm_x86_ops->decache_regs(vcpu);
1562
1563         io->count -= io->cur_count;
1564         io->cur_count = 0;
1565
1566         return 0;
1567 }
1568
1569 static void kernel_pio(struct kvm_io_device *pio_dev,
1570                        struct kvm_vcpu *vcpu,
1571                        void *pd)
1572 {
1573         /* TODO: String I/O for in kernel device */
1574
1575         mutex_lock(&vcpu->kvm->lock);
1576         if (vcpu->pio.in)
1577                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1578                                   vcpu->pio.size,
1579                                   pd);
1580         else
1581                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1582                                    vcpu->pio.size,
1583                                    pd);
1584         mutex_unlock(&vcpu->kvm->lock);
1585 }
1586
1587 static void pio_string_write(struct kvm_io_device *pio_dev,
1588                              struct kvm_vcpu *vcpu)
1589 {
1590         struct kvm_pio_request *io = &vcpu->pio;
1591         void *pd = vcpu->pio_data;
1592         int i;
1593
1594         mutex_lock(&vcpu->kvm->lock);
1595         for (i = 0; i < io->cur_count; i++) {
1596                 kvm_iodevice_write(pio_dev, io->port,
1597                                    io->size,
1598                                    pd);
1599                 pd += io->size;
1600         }
1601         mutex_unlock(&vcpu->kvm->lock);
1602 }
1603
1604 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1605                                                gpa_t addr)
1606 {
1607         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1608 }
1609
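/*
 * Emulate a single IN/OUT instruction.  The exit information is filled
 * in so userspace can service the port if needed; if an in-kernel
 * device claims the port, the I/O is completed here and 1 is returned,
 * otherwise 0 is returned and the exit is handed to userspace.
 */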
1610 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1611                   int size, unsigned port)
1612 {
1613         struct kvm_io_device *pio_dev;
1614
1615         vcpu->run->exit_reason = KVM_EXIT_IO;
1616         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1617         vcpu->run->io.size = vcpu->pio.size = size;
1618         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1619         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1620         vcpu->run->io.port = vcpu->pio.port = port;
1621         vcpu->pio.in = in;
1622         vcpu->pio.string = 0;
1623         vcpu->pio.down = 0;
1624         vcpu->pio.guest_page_offset = 0;
1625         vcpu->pio.rep = 0;
1626
1627         kvm_x86_ops->cache_regs(vcpu);
1628         memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1629         kvm_x86_ops->decache_regs(vcpu);
1630
1631         kvm_x86_ops->skip_emulated_instruction(vcpu);
1632
1633         pio_dev = vcpu_find_pio_dev(vcpu, port);
1634         if (pio_dev) {
1635                 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
1636                 complete_pio(vcpu);
1637                 return 1;
1638         }
1639         return 0;
1640 }
1641 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
1642
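/*
 * Emulate a string IN/OUT (INS/OUTS) instruction.  The guest pages
 * backing the string are pinned (two pages if a single element
 * straddles a page boundary) and the transfer is clamped to what fits
 * within a page; RIP is only advanced when the clamped count covers
 * the whole request, so the remainder is handled by re-executing the
 * instruction.  String writes to an in-kernel device are completed
 * here; string reads from in-kernel devices are not supported yet.
 */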
1643 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1644                   int size, unsigned long count, int down,
1645                   gva_t address, int rep, unsigned port)
1646 {
1647         unsigned now, in_page;
1648         int i, ret = 0;
1649         int nr_pages = 1;
1650         struct page *page;
1651         struct kvm_io_device *pio_dev;
1652
1653         vcpu->run->exit_reason = KVM_EXIT_IO;
1654         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1655         vcpu->run->io.size = vcpu->pio.size = size;
1656         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1657         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
1658         vcpu->run->io.port = vcpu->pio.port = port;
1659         vcpu->pio.in = in;
1660         vcpu->pio.string = 1;
1661         vcpu->pio.down = down;
1662         vcpu->pio.guest_page_offset = offset_in_page(address);
1663         vcpu->pio.rep = rep;
1664
1665         if (!count) {
1666                 kvm_x86_ops->skip_emulated_instruction(vcpu);
1667                 return 1;
1668         }
1669
1670         if (!down)
1671                 in_page = PAGE_SIZE - offset_in_page(address);
1672         else
1673                 in_page = offset_in_page(address) + size;
1674         now = min(count, (unsigned long)in_page / size);
1675         if (!now) {
1676                 /*
1677                  * String I/O straddles page boundary.  Pin two guest pages
1678                  * so that we satisfy atomicity constraints.  Do just one
1679                  * transaction to avoid complexity.
1680                  */
1681                 nr_pages = 2;
1682                 now = 1;
1683         }
1684         if (down) {
1685                 /*
1686                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
1687                  */
1688                 pr_unimpl(vcpu, "guest string pio down\n");
1689                 inject_gp(vcpu);
1690                 return 1;
1691         }
1692         vcpu->run->io.count = now;
1693         vcpu->pio.cur_count = now;
1694
1695         if (vcpu->pio.cur_count == vcpu->pio.count)
1696                 kvm_x86_ops->skip_emulated_instruction(vcpu);
1697
1698         for (i = 0; i < nr_pages; ++i) {
1699                 mutex_lock(&vcpu->kvm->lock);
1700                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
1701                 vcpu->pio.guest_pages[i] = page;
1702                 mutex_unlock(&vcpu->kvm->lock);
1703                 if (!page) {
1704                         inject_gp(vcpu);
1705                         free_pio_guest_pages(vcpu);
1706                         return 1;
1707                 }
1708         }
1709
1710         pio_dev = vcpu_find_pio_dev(vcpu, port);
1711         if (!vcpu->pio.in) {
1712                 /* string PIO write */
1713                 ret = pio_copy_data(vcpu);
1714                 if (ret >= 0 && pio_dev) {
1715                         pio_string_write(pio_dev, vcpu);
1716                         complete_pio(vcpu);
1717                         if (vcpu->pio.count == 0)
1718                                 ret = 1;
1719                 }
1720         } else if (pio_dev)
1721                 pr_unimpl(vcpu, "no string pio read support yet, "
1722                        "port %x size %d count %ld\n",
1723                         port, size, count);
1724
1725         return ret;
1726 }
1727 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
1728
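/*
 * Called when a hardware backend module (kvm-intel or kvm-amd) loads:
 * initialize the MMU module, build the MSR list, verify hardware
 * support, and register the backend's kvm_x86_ops.
 */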
1729 int kvm_arch_init(void *opaque)
1730 {
1731         int r;
1732         struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
1733
1734         r = kvm_mmu_module_init();
1735         if (r)
1736                 goto out_fail;
1737
1738         kvm_init_msr_list();
1739
1740         if (kvm_x86_ops) {
1741                 printk(KERN_ERR "kvm: already loaded the other module\n");
1742                 r = -EEXIST;
1743                 goto out;
1744         }
1745
1746         if (!ops->cpu_has_kvm_support()) {
1747                 printk(KERN_ERR "kvm: no hardware support\n");
1748                 r = -EOPNOTSUPP;
1749                 goto out;
1750         }
1751         if (ops->disabled_by_bios()) {
1752                 printk(KERN_ERR "kvm: disabled by bios\n");
1753                 r = -EOPNOTSUPP;
1754                 goto out;
1755         }
1756
1757         kvm_x86_ops = ops;
1758         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
1759         return 0;
1760
1761 out:
1762         kvm_mmu_module_exit();
1763 out_fail:
1764         return r;
1765 }
1766
1767 void kvm_arch_exit(void)
1768 {
1769         kvm_x86_ops = NULL;
1770         kvm_mmu_module_exit();
1771 }
1772
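/*
 * Emulate HLT: with an in-kernel irqchip the VCPU blocks in the kernel
 * until it becomes runnable again; otherwise the halt is reported to
 * userspace via KVM_EXIT_HLT.
 */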
1773 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1774 {
1775         ++vcpu->stat.halt_exits;
1776         if (irqchip_in_kernel(vcpu->kvm)) {
1777                 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1778                 kvm_vcpu_block(vcpu);
1779                 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1780                         return -EINTR;
1781                 return 1;
1782         } else {
1783                 vcpu->run->exit_reason = KVM_EXIT_HLT;
1784                 return 0;
1785         }
1786 }
1787 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1788
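/*
 * Emulate a guest hypercall.  The hypercall number is taken from RAX
 * and up to four arguments from RBX/RCX/RDX/RSI, truncated to 32 bits
 * outside long mode.  No hypercalls are implemented yet, so the guest
 * always gets -KVM_ENOSYS back in RAX.
 */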
1789 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
1790 {
1791         unsigned long nr, a0, a1, a2, a3, ret;
1792
1793         kvm_x86_ops->cache_regs(vcpu);
1794
1795         nr = vcpu->regs[VCPU_REGS_RAX];
1796         a0 = vcpu->regs[VCPU_REGS_RBX];
1797         a1 = vcpu->regs[VCPU_REGS_RCX];
1798         a2 = vcpu->regs[VCPU_REGS_RDX];
1799         a3 = vcpu->regs[VCPU_REGS_RSI];
1800
1801         if (!is_long_mode(vcpu)) {
1802                 nr &= 0xFFFFFFFF;
1803                 a0 &= 0xFFFFFFFF;
1804                 a1 &= 0xFFFFFFFF;
1805                 a2 &= 0xFFFFFFFF;
1806                 a3 &= 0xFFFFFFFF;
1807         }
1808
1809         switch (nr) {
1810         default:
1811                 ret = -KVM_ENOSYS;
1812                 break;
1813         }
1814         vcpu->regs[VCPU_REGS_RAX] = ret;
1815         kvm_x86_ops->decache_regs(vcpu);
1816         return 0;
1817 }
1818 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
1819
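/*
 * Rewrite the hypercall instruction at the guest's RIP with the
 * sequence appropriate for the underlying hardware (VMCALL or VMMCALL),
 * zapping the shadow MMU first so the patched bytes appear atomically
 * to all VCPUs.
 */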
1820 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
1821 {
1822         char instruction[3];
1823         int ret = 0;
1824
1825         mutex_lock(&vcpu->kvm->lock);
1826
1827         /*
1828          * Blow out the MMU so that no other VCPU has an active mapping,
1829          * ensuring that the updated hypercall appears atomically across
1830          * all VCPUs.
1831          */
1832         kvm_mmu_zap_all(vcpu->kvm);
1833
1834         kvm_x86_ops->cache_regs(vcpu);
1835         kvm_x86_ops->patch_hypercall(vcpu, instruction);
1836         if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
1837             != X86EMUL_CONTINUE)
1838                 ret = -EFAULT;
1839
1840         mutex_unlock(&vcpu->kvm->lock);
1841
1842         return ret;
1843 }
1844
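/*
 * Merge a 32-bit value into the low half of a 64-bit control register,
 * preserving the upper 32 bits.
 */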
1845 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1846 {
1847         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1848 }
1849
1850 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1851 {
1852         struct descriptor_table dt = { limit, base };
1853
1854         kvm_x86_ops->set_gdt(vcpu, &dt);
1855 }
1856
1857 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1858 {
1859         struct descriptor_table dt = { limit, base };
1860
1861         kvm_x86_ops->set_idt(vcpu, &dt);
1862 }
1863
1864 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1865                    unsigned long *rflags)
1866 {
1867         lmsw(vcpu, msw);
1868         *rflags = kvm_x86_ops->get_rflags(vcpu);
1869 }
1870
1871 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1872 {
1873         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1874         switch (cr) {
1875         case 0:
1876                 return vcpu->cr0;
1877         case 2:
1878                 return vcpu->cr2;
1879         case 3:
1880                 return vcpu->cr3;
1881         case 4:
1882                 return vcpu->cr4;
1883         default:
1884                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1885                 return 0;
1886         }
1887 }
1888
1889 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1890                      unsigned long *rflags)
1891 {
1892         switch (cr) {
1893         case 0:
1894                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1895                 *rflags = kvm_x86_ops->get_rflags(vcpu);
1896                 break;
1897         case 2:
1898                 vcpu->cr2 = val;
1899                 break;
1900         case 3:
1901                 set_cr3(vcpu, val);
1902                 break;
1903         case 4:
1904                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1905                 break;
1906         default:
1907                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1908         }
1909 }
1910
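/*
 * Emulate CPUID using the entries userspace installed with
 * KVM_SET_CPUID: pick an exact match for the requested leaf if there
 * is one, otherwise the highest entry in the same basic/extended
 * range, and return zeroes if nothing matches.
 */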
1911 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1912 {
1913         int i;
1914         u32 function;
1915         struct kvm_cpuid_entry *e, *best;
1916
1917         kvm_x86_ops->cache_regs(vcpu);
1918         function = vcpu->regs[VCPU_REGS_RAX];
1919         vcpu->regs[VCPU_REGS_RAX] = 0;
1920         vcpu->regs[VCPU_REGS_RBX] = 0;
1921         vcpu->regs[VCPU_REGS_RCX] = 0;
1922         vcpu->regs[VCPU_REGS_RDX] = 0;
1923         best = NULL;
1924         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1925                 e = &vcpu->cpuid_entries[i];
1926                 if (e->function == function) {
1927                         best = e;
1928                         break;
1929                 }
1930                 /*
1931                  * Is this entry in the same range (basic or extended) as the request?
1932                  */
1933                 if (((e->function ^ function) & 0x80000000) == 0)
1934                         if (!best || e->function > best->function)
1935                                 best = e;
1936         }
1937         if (best) {
1938                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1939                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1940                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1941                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1942         }
1943         kvm_x86_ops->decache_regs(vcpu);
1944         kvm_x86_ops->skip_emulated_instruction(vcpu);
1945 }
1946 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1947
1948 /*
1949  * Check whether userspace requested an interrupt window and whether
1950  * the interrupt window is open.
1951  *
1952  * No need to exit to userspace if we already have an interrupt queued.
1953  */
1954 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1955                                           struct kvm_run *kvm_run)
1956 {
1957         return (!vcpu->irq_summary &&
1958                 kvm_run->request_interrupt_window &&
1959                 vcpu->interrupt_window_open &&
1960                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
1961 }
1962
1963 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1964                               struct kvm_run *kvm_run)
1965 {
1966         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
1967         kvm_run->cr8 = get_cr8(vcpu);
1968         kvm_run->apic_base = kvm_get_apic_base(vcpu);
1969         if (irqchip_in_kernel(vcpu->kvm))
1970                 kvm_run->ready_for_interrupt_injection = 1;
1971         else
1972                 kvm_run->ready_for_interrupt_injection =
1973                                         (vcpu->interrupt_window_open &&
1974                                          vcpu->irq_summary == 0);
1975 }
1976
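/*
 * The inner run loop: a VCPU that has just received a SIPI is reset
 * and marked runnable, then the loop reloads the shadow MMU, injects
 * pending interrupts, enters the guest, and handles the resulting
 * exit.  It keeps re-entering the guest until an exit must be
 * delivered to userspace, a signal is pending, or the exit handler
 * reports an error.
 */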
1977 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1978 {
1979         int r;
1980
1981         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
1982                 pr_debug("vcpu %d received sipi with vector # %x\n",
1983                        vcpu->vcpu_id, vcpu->sipi_vector);
1984                 kvm_lapic_reset(vcpu);
1985                 r = kvm_x86_ops->vcpu_reset(vcpu);
1986                 if (r)
1987                         return r;
1988                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
1989         }
1990
1991 preempted:
1992         if (vcpu->guest_debug.enabled)
1993                 kvm_x86_ops->guest_debug_pre(vcpu);
1994
1995 again:
1996         r = kvm_mmu_reload(vcpu);
1997         if (unlikely(r))
1998                 goto out;
1999
2000         kvm_inject_pending_timer_irqs(vcpu);
2001
2002         preempt_disable();
2003
2004         kvm_x86_ops->prepare_guest_switch(vcpu);
2005         kvm_load_guest_fpu(vcpu);
2006
2007         local_irq_disable();
2008
2009         if (signal_pending(current)) {
2010                 local_irq_enable();
2011                 preempt_enable();
2012                 r = -EINTR;
2013                 kvm_run->exit_reason = KVM_EXIT_INTR;
2014                 ++vcpu->stat.signal_exits;
2015                 goto out;
2016         }
2017
2018         if (irqchip_in_kernel(vcpu->kvm))
2019                 kvm_x86_ops->inject_pending_irq(vcpu);
2020         else if (!vcpu->mmio_read_completed)
2021                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2022
2023         vcpu->guest_mode = 1;
2024         kvm_guest_enter();
2025
2026         if (vcpu->requests)
2027                 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2028                         kvm_x86_ops->tlb_flush(vcpu);
2029
2030         kvm_x86_ops->run(vcpu, kvm_run);
2031
2032         vcpu->guest_mode = 0;
2033         local_irq_enable();
2034
2035         ++vcpu->stat.exits;
2036
2037         /*
2038          * We must have an instruction between local_irq_enable() and
2039          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2040          * the interrupt shadow.  The stat.exits increment will do nicely.
2041          * But we need to prevent reordering, hence this barrier():
2042          */
2043         barrier();
2044
2045         kvm_guest_exit();
2046
2047         preempt_enable();
2048
2049         /*
2050          * Profile KVM exit RIPs:
2051          */
2052         if (unlikely(prof_on == KVM_PROFILING)) {
2053                 kvm_x86_ops->cache_regs(vcpu);
2054                 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2055         }
2056
2057         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2058
2059         if (r > 0) {
2060                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2061                         r = -EINTR;
2062                         kvm_run->exit_reason = KVM_EXIT_INTR;
2063                         ++vcpu->stat.request_irq_exits;
2064                         goto out;
2065                 }
2066                 if (!need_resched())
2067                         goto again;
2068         }
2069
2070 out:
2071         if (r > 0) {
2072                 kvm_resched(vcpu);
2073                 goto preempted;
2074         }
2075
2076         post_kvm_run_save(vcpu, kvm_run);
2077
2078         return r;
2079 }
2080
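/*
 * Top-level handler for the KVM_RUN ioctl: apply the caller's signal
 * mask, re-sync the TPR for a userspace irqchip, finish any port or
 * MMIO access left over from the previous exit, then drop into
 * __vcpu_run() until the next userspace exit.
 */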
2081 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2082 {
2083         int r;
2084         sigset_t sigsaved;
2085
2086         vcpu_load(vcpu);
2087
2088         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2089                 kvm_vcpu_block(vcpu);
2090                 vcpu_put(vcpu);
2091                 return -EAGAIN;
2092         }
2093
2094         if (vcpu->sigset_active)
2095                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2096
2097         /* re-sync apic's tpr */
2098         if (!irqchip_in_kernel(vcpu->kvm))
2099                 set_cr8(vcpu, kvm_run->cr8);
2100
2101         if (vcpu->pio.cur_count) {
2102                 r = complete_pio(vcpu);
2103                 if (r)
2104                         goto out;
2105         }
2106 #ifdef CONFIG_HAS_IOMEM
2107         if (vcpu->mmio_needed) {
2108                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2109                 vcpu->mmio_read_completed = 1;
2110                 vcpu->mmio_needed = 0;
2111                 r = emulate_instruction(vcpu, kvm_run,
2112                                         vcpu->mmio_fault_cr2, 0, 1);
2113                 if (r == EMULATE_DO_MMIO) {
2114                         /*
2115                          * Read-modify-write.  Back to userspace.
2116                          */
2117                         r = 0;
2118                         goto out;
2119                 }
2120         }
2121 #endif
2122         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2123                 kvm_x86_ops->cache_regs(vcpu);
2124                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2125                 kvm_x86_ops->decache_regs(vcpu);
2126         }
2127
2128         r = __vcpu_run(vcpu, kvm_run);
2129
2130 out:
2131         if (vcpu->sigset_active)
2132                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2133
2134         vcpu_put(vcpu);
2135         return r;
2136 }
2137
2138 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2139 {
2140         vcpu_load(vcpu);
2141
2142         kvm_x86_ops->cache_regs(vcpu);
2143
2144         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2145         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2146         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2147         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2148         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2149         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2150         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2151         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2152 #ifdef CONFIG_X86_64
2153         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2154         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2155         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2156         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2157         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2158         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2159         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2160         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2161 #endif
2162
2163         regs->rip = vcpu->rip;
2164         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2165
2166         /*
2167          * Don't leak debug flags in case they were set for guest debugging
2168          */
2169         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2170                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2171
2172         vcpu_put(vcpu);
2173
2174         return 0;
2175 }
2176
2177 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
2178 {
2179         vcpu_load(vcpu);
2180
2181         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2182         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2183         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2184         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2185         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2186         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2187         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2188         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2189 #ifdef CONFIG_X86_64
2190         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2191         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2192         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2193         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2194         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2195         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2196         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2197         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2198 #endif
2199
2200         vcpu->rip = regs->rip;
2201         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2202
2203         kvm_x86_ops->decache_regs(vcpu);
2204
2205         vcpu_put(vcpu);
2206
2207         return 0;
2208 }
2209
2210 static void get_segment(struct kvm_vcpu *vcpu,
2211                         struct kvm_segment *var, int seg)
2212 {
2213         kvm_x86_ops->get_segment(vcpu, var, seg);
2214 }
2215
2216 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2217 {
2218         struct kvm_segment cs;
2219
2220         get_segment(vcpu, &cs, VCPU_SREG_CS);
2221         *db = cs.db;
2222         *l = cs.l;
2223 }
2224 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2225
2226 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2227                                   struct kvm_sregs *sregs)
2228 {
2229         struct descriptor_table dt;
2230         int pending_vec;
2231
2232         vcpu_load(vcpu);
2233
2234         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2235         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2236         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2237         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2238         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2239         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2240
2241         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2242         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2243
2244         kvm_x86_ops->get_idt(vcpu, &dt);
2245         sregs->idt.limit = dt.limit;
2246         sregs->idt.base = dt.base;
2247         kvm_x86_ops->get_gdt(vcpu, &dt);
2248         sregs->gdt.limit = dt.limit;
2249         sregs->gdt.base = dt.base;
2250
2251         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2252         sregs->cr0 = vcpu->cr0;
2253         sregs->cr2 = vcpu->cr2;
2254         sregs->cr3 = vcpu->cr3;
2255         sregs->cr4 = vcpu->cr4;
2256         sregs->cr8 = get_cr8(vcpu);
2257         sregs->efer = vcpu->shadow_efer;
2258         sregs->apic_base = kvm_get_apic_base(vcpu);
2259
2260         if (irqchip_in_kernel(vcpu->kvm)) {
2261                 memset(sregs->interrupt_bitmap, 0,
2262                        sizeof sregs->interrupt_bitmap);
2263                 pending_vec = kvm_x86_ops->get_irq(vcpu);
2264                 if (pending_vec >= 0)
2265                         set_bit(pending_vec,
2266                                 (unsigned long *)sregs->interrupt_bitmap);
2267         } else
2268                 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2269                        sizeof sregs->interrupt_bitmap);
2270
2271         vcpu_put(vcpu);
2272
2273         return 0;
2274 }
2275
2276 static void set_segment(struct kvm_vcpu *vcpu,
2277                         struct kvm_segment *var, int seg)
2278 {
2279         kvm_x86_ops->set_segment(vcpu, var, seg);
2280 }
2281
2282 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2283                                   struct kvm_sregs *sregs)
2284 {
2285         int mmu_reset_needed = 0;
2286         int i, pending_vec, max_bits;
2287         struct descriptor_table dt;
2288
2289         vcpu_load(vcpu);
2290
2291         dt.limit = sregs->idt.limit;
2292         dt.base = sregs->idt.base;
2293         kvm_x86_ops->set_idt(vcpu, &dt);
2294         dt.limit = sregs->gdt.limit;
2295         dt.base = sregs->gdt.base;
2296         kvm_x86_ops->set_gdt(vcpu, &dt);
2297
2298         vcpu->cr2 = sregs->cr2;
2299         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2300         vcpu->cr3 = sregs->cr3;
2301
2302         set_cr8(vcpu, sregs->cr8);
2303
2304         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2305 #ifdef CONFIG_X86_64
2306         kvm_x86_ops->set_efer(vcpu, sregs->efer);
2307 #endif
2308         kvm_set_apic_base(vcpu, sregs->apic_base);
2309
2310         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2311
2312         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2313         vcpu->cr0 = sregs->cr0;
2314         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2315
2316         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2317         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2318         if (!is_long_mode(vcpu) && is_pae(vcpu))
2319                 load_pdptrs(vcpu, vcpu->cr3);
2320
2321         if (mmu_reset_needed)
2322                 kvm_mmu_reset_context(vcpu);
2323
2324         if (!irqchip_in_kernel(vcpu->kvm)) {
2325                 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2326                        sizeof vcpu->irq_pending);
2327                 vcpu->irq_summary = 0;
2328                 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2329                         if (vcpu->irq_pending[i])
2330                                 __set_bit(i, &vcpu->irq_summary);
2331         } else {
2332                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2333                 pending_vec = find_first_bit(
2334                         (const unsigned long *)sregs->interrupt_bitmap,
2335                         max_bits);
2336                 /* Only a pending external irq is handled here */
2337                 if (pending_vec < max_bits) {
2338                         kvm_x86_ops->set_irq(vcpu, pending_vec);
2339                         pr_debug("Set back pending irq %d\n",
2340                                  pending_vec);
2341                 }
2342         }
2343
2344         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2345         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2346         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2347         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2348         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2349         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2350
2351         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2352         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2353
2354         vcpu_put(vcpu);
2355
2356         return 0;
2357 }
2358
2359 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2360                                     struct kvm_debug_guest *dbg)
2361 {
2362         int r;
2363
2364         vcpu_load(vcpu);
2365
2366         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2367
2368         vcpu_put(vcpu);
2369
2370         return r;
2371 }
2372
2373 /*
2374  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2375  * we have asm/x86/processor.h
2376  */
2377 struct fxsave {
2378         u16     cwd;
2379         u16     swd;
2380         u16     twd;
2381         u16     fop;
2382         u64     rip;
2383         u64     rdp;
2384         u32     mxcsr;
2385         u32     mxcsr_mask;
2386         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2387 #ifdef CONFIG_X86_64
2388         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2389 #else
2390         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2391 #endif
2392 };
2393
2394 /*
2395  * Translate a guest virtual address to a guest physical address.
2396  */
2397 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2398                                     struct kvm_translation *tr)
2399 {
2400         unsigned long vaddr = tr->linear_address;
2401         gpa_t gpa;
2402
2403         vcpu_load(vcpu);
2404         mutex_lock(&vcpu->kvm->lock);
2405         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2406         tr->physical_address = gpa;
2407         tr->valid = gpa != UNMAPPED_GVA;
2408         tr->writeable = 1;
2409         tr->usermode = 0;
2410         mutex_unlock(&vcpu->kvm->lock);
2411         vcpu_put(vcpu);
2412
2413         return 0;
2414 }
2415
2416 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2417 {
2418         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2419
2420         vcpu_load(vcpu);
2421
2422         memcpy(fpu->fpr, fxsave->st_space, 128);
2423         fpu->fcw = fxsave->cwd;
2424         fpu->fsw = fxsave->swd;
2425         fpu->ftwx = fxsave->twd;
2426         fpu->last_opcode = fxsave->fop;
2427         fpu->last_ip = fxsave->rip;
2428         fpu->last_dp = fxsave->rdp;
2429         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2430
2431         vcpu_put(vcpu);
2432
2433         return 0;
2434 }
2435
2436 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2437 {
2438         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2439
2440         vcpu_load(vcpu);
2441
2442         memcpy(fxsave->st_space, fpu->fpr, 128);
2443         fxsave->cwd = fpu->fcw;
2444         fxsave->swd = fpu->fsw;
2445         fxsave->twd = fpu->ftwx;
2446         fxsave->fop = fpu->last_opcode;
2447         fxsave->rip = fpu->last_ip;
2448         fxsave->rdp = fpu->last_dp;
2449         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2450
2451         vcpu_put(vcpu);
2452
2453         return 0;
2454 }
2455
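/*
 * Give the guest a freshly initialized FPU image: reset the host FPU,
 * save that clean state as the guest image, then restore the original
 * host state.  MXCSR is set to its power-on value (0x1f80) and the
 * register area following the MXCSR mask field is cleared.
 */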
2456 void fx_init(struct kvm_vcpu *vcpu)
2457 {
2458         unsigned after_mxcsr_mask;
2459
2460         /* Initialize guest FPU by resetting ours and saving into guest's */
2461         preempt_disable();
2462         fx_save(&vcpu->host_fx_image);
2463         fpu_init();
2464         fx_save(&vcpu->guest_fx_image);
2465         fx_restore(&vcpu->host_fx_image);
2466         preempt_enable();
2467
2468         vcpu->cr0 |= X86_CR0_ET;
2469         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
2470         vcpu->guest_fx_image.mxcsr = 0x1f80;
2471         memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
2472                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
2473 }
2474 EXPORT_SYMBOL_GPL(fx_init);
2475
2476 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
2477 {
2478         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
2479                 return;
2480
2481         vcpu->guest_fpu_loaded = 1;
2482         fx_save(&vcpu->host_fx_image);
2483         fx_restore(&vcpu->guest_fx_image);
2484 }
2485 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
2486
2487 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
2488 {
2489         if (!vcpu->guest_fpu_loaded)
2490                 return;
2491
2492         vcpu->guest_fpu_loaded = 0;
2493         fx_save(&vcpu->guest_fx_image);
2494         fx_restore(&vcpu->host_fx_image);
2495         ++vcpu->stat.fpu_reload;
2496 }
2497 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
2498
2499 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
2500 {
2501         kvm_x86_ops->vcpu_free(vcpu);
2502 }
2503
2504 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
2505                                                 unsigned int id)
2506 {
2507         return kvm_x86_ops->vcpu_create(kvm, id);
2508 }
2509
2510 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
2511 {
2512         int r;
2513
2514         /* We do fxsave: this must be aligned. */
2515         BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2516
2517         vcpu_load(vcpu);
2518         r = kvm_arch_vcpu_reset(vcpu);
2519         if (r == 0)
2520                 r = kvm_mmu_setup(vcpu);
2521         vcpu_put(vcpu);
2522         if (r < 0)
2523                 goto free_vcpu;
2524
2525         return 0;
2526 free_vcpu:
2527         kvm_x86_ops->vcpu_free(vcpu);
2528         return r;
2529 }
2530
2531 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
2532 {
2533         vcpu_load(vcpu);
2534         kvm_mmu_unload(vcpu);
2535         vcpu_put(vcpu);
2536
2537         kvm_x86_ops->vcpu_free(vcpu);
2538 }
2539
2540 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
2541 {
2542         return kvm_x86_ops->vcpu_reset(vcpu);
2543 }
2544
2545 void kvm_arch_hardware_enable(void *garbage)
2546 {
2547         kvm_x86_ops->hardware_enable(garbage);
2548 }
2549
2550 void kvm_arch_hardware_disable(void *garbage)
2551 {
2552         kvm_x86_ops->hardware_disable(garbage);
2553 }
2554
2555 int kvm_arch_hardware_setup(void)
2556 {
2557         return kvm_x86_ops->hardware_setup();
2558 }
2559
2560 void kvm_arch_hardware_unsetup(void)
2561 {
2562         kvm_x86_ops->hardware_unsetup();
2563 }
2564
2565 void kvm_arch_check_processor_compat(void *rtn)
2566 {
2567         kvm_x86_ops->check_processor_compatibility(rtn);
2568 }
2569
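/*
 * Architecture-specific VCPU construction: allocate the page shared
 * with userspace for PIO data, create the MMU context, and, when the
 * irqchip is in the kernel, the local APIC.  VCPU 0 starts runnable;
 * with an in-kernel irqchip the other VCPUs wait for an INIT/SIPI.
 */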
2570 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
2571 {
2572         struct page *page;
2573         struct kvm *kvm;
2574         int r;
2575
2576         BUG_ON(vcpu->kvm == NULL);
2577         kvm = vcpu->kvm;
2578
2579         vcpu->mmu.root_hpa = INVALID_PAGE;
2580         if (!irqchip_in_kernel(kvm) || vcpu->vcpu_id == 0)
2581                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
2582         else
2583                 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
2584
2585         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
2586         if (!page) {
2587                 r = -ENOMEM;
2588                 goto fail;
2589         }
2590         vcpu->pio_data = page_address(page);
2591
2592         r = kvm_mmu_create(vcpu);
2593         if (r < 0)
2594                 goto fail_free_pio_data;
2595
2596         if (irqchip_in_kernel(kvm)) {
2597                 r = kvm_create_lapic(vcpu);
2598                 if (r < 0)
2599                         goto fail_mmu_destroy;
2600         }
2601
2602         return 0;
2603
2604 fail_mmu_destroy:
2605         kvm_mmu_destroy(vcpu);
2606 fail_free_pio_data:
2607         free_page((unsigned long)vcpu->pio_data);
2608 fail:
2609         return r;
2610 }
2611
2612 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
2613 {
2614         kvm_free_lapic(vcpu);
2615         kvm_mmu_destroy(vcpu);
2616         free_page((unsigned long)vcpu->pio_data);
2617 }
2618
2619 struct  kvm *kvm_arch_create_vm(void)
2620 {
2621         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
2622
2623         if (!kvm)
2624                 return ERR_PTR(-ENOMEM);
2625
2626         INIT_LIST_HEAD(&kvm->active_mmu_pages);
2627
2628         return kvm;
2629 }
2630
2631 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
2632 {
2633         vcpu_load(vcpu);
2634         kvm_mmu_unload(vcpu);
2635         vcpu_put(vcpu);
2636 }
2637
2638 static void kvm_free_vcpus(struct kvm *kvm)
2639 {
2640         unsigned int i;
2641
2642         /*
2643          * Unpin any mmu pages first.
2644          */
2645         for (i = 0; i < KVM_MAX_VCPUS; ++i)
2646                 if (kvm->vcpus[i])
2647                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
2648         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
2649                 if (kvm->vcpus[i]) {
2650                         kvm_arch_vcpu_free(kvm->vcpus[i]);
2651                         kvm->vcpus[i] = NULL;
2652                 }
2653         }
2654
2655 }
2656
2657 void kvm_arch_destroy_vm(struct kvm *kvm)
2658 {
2659         kfree(kvm->vpic);
2660         kfree(kvm->vioapic);
2661         kvm_free_vcpus(kvm);
2662         kvm_free_physmem(kvm);
2663         kfree(kvm);
2664 }
2665
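/*
 * Arch-specific part of setting a memory slot.  For legacy callers
 * that did not allocate the memory themselves (!user_alloc), anonymous
 * memory is mmap'ed (or munmap'ed when the slot goes away) on their
 * behalf.  The shadow MMU page limit is recalculated unless userspace
 * requested a specific number, write access is stripped from the
 * slot's shadow mappings, and remote TLBs are flushed.
 */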
2666 int kvm_arch_set_memory_region(struct kvm *kvm,
2667                                 struct kvm_userspace_memory_region *mem,
2668                                 struct kvm_memory_slot old,
2669                                 int user_alloc)
2670 {
2671         int npages = mem->memory_size >> PAGE_SHIFT;
2672         struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
2673
2674         /* To keep backward compatibility with older userspace,
2675          * x86 needs to handle the !user_alloc case.
2676          */
2677         if (!user_alloc) {
2678                 if (npages && !old.rmap) {
2679                         down_write(&current->mm->mmap_sem);
2680                         memslot->userspace_addr = do_mmap(NULL, 0,
2681                                                      npages * PAGE_SIZE,
2682                                                      PROT_READ | PROT_WRITE,
2683                                                      MAP_SHARED | MAP_ANONYMOUS,
2684                                                      0);
2685                         up_write(&current->mm->mmap_sem);
2686
2687                         if (IS_ERR((void *)memslot->userspace_addr))
2688                                 return PTR_ERR((void *)memslot->userspace_addr);
2689                 } else {
2690                         if (!old.user_alloc && old.rmap) {
2691                                 int ret;
2692
2693                                 down_write(&current->mm->mmap_sem);
2694                                 ret = do_munmap(current->mm, old.userspace_addr,
2695                                                 old.npages * PAGE_SIZE);
2696                                 up_write(&current->mm->mmap_sem);
2697                                 if (ret < 0)
2698                                         printk(KERN_WARNING
2699                                        "kvm_vm_ioctl_set_memory_region: "
2700                                        "failed to munmap memory\n");
2701                         }
2702                 }
2703         }
2704
2705         if (!kvm->n_requested_mmu_pages) {
2706                 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
2707                 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
2708         }
2709
2710         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
2711         kvm_flush_remote_tlbs(kvm);
2712
2713         return 0;
2714 }