KVM: Portability: split kvm_vcpu_ioctl
[linux-2.6] / drivers / kvm / kvm_main.c
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "x86.h"
20 #include "x86_emulate.h"
21 #include "segment_descriptor.h"
22 #include "irq.h"
23
24 #include <linux/kvm.h>
25 #include <linux/module.h>
26 #include <linux/errno.h>
27 #include <linux/percpu.h>
28 #include <linux/gfp.h>
29 #include <linux/mm.h>
30 #include <linux/miscdevice.h>
31 #include <linux/vmalloc.h>
32 #include <linux/reboot.h>
33 #include <linux/debugfs.h>
34 #include <linux/highmem.h>
35 #include <linux/file.h>
36 #include <linux/sysdev.h>
37 #include <linux/cpu.h>
38 #include <linux/sched.h>
39 #include <linux/cpumask.h>
40 #include <linux/smp.h>
41 #include <linux/anon_inodes.h>
42 #include <linux/profile.h>
43 #include <linux/kvm_para.h>
44 #include <linux/pagemap.h>
45
46 #include <asm/processor.h>
47 #include <asm/msr.h>
48 #include <asm/io.h>
49 #include <asm/uaccess.h>
50 #include <asm/desc.h>
51
52 MODULE_AUTHOR("Qumranet");
53 MODULE_LICENSE("GPL");
54
55 static DEFINE_SPINLOCK(kvm_lock);
56 static LIST_HEAD(vm_list);
57
58 static cpumask_t cpus_hardware_enabled;
59
60 struct kvm_x86_ops *kvm_x86_ops;
61 struct kmem_cache *kvm_vcpu_cache;
62 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
63
64 static __read_mostly struct preempt_ops kvm_preempt_ops;
65
66 #define STAT_OFFSET(x) offsetof(struct kvm_vcpu, stat.x)
67
68 static struct kvm_stats_debugfs_item {
69         const char *name;
70         int offset;
71         struct dentry *dentry;
72 } debugfs_entries[] = {
73         { "pf_fixed", STAT_OFFSET(pf_fixed) },
74         { "pf_guest", STAT_OFFSET(pf_guest) },
75         { "tlb_flush", STAT_OFFSET(tlb_flush) },
76         { "invlpg", STAT_OFFSET(invlpg) },
77         { "exits", STAT_OFFSET(exits) },
78         { "io_exits", STAT_OFFSET(io_exits) },
79         { "mmio_exits", STAT_OFFSET(mmio_exits) },
80         { "signal_exits", STAT_OFFSET(signal_exits) },
81         { "irq_window", STAT_OFFSET(irq_window_exits) },
82         { "halt_exits", STAT_OFFSET(halt_exits) },
83         { "halt_wakeup", STAT_OFFSET(halt_wakeup) },
84         { "request_irq", STAT_OFFSET(request_irq_exits) },
85         { "irq_exits", STAT_OFFSET(irq_exits) },
86         { "light_exits", STAT_OFFSET(light_exits) },
87         { "efer_reload", STAT_OFFSET(efer_reload) },
88         { NULL }
89 };
90
91 static struct dentry *debugfs_dir;
92
93 #define CR0_RESERVED_BITS                                               \
94         (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
95                           | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
96                           | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
97 #define CR4_RESERVED_BITS                                               \
98         (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
99                           | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
100                           | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR  \
101                           | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
102
103 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
104 #define EFER_RESERVED_BITS 0xfffffffffffff2fe
105
106 #ifdef CONFIG_X86_64
107 /* LDT or TSS descriptor in the GDT. 16 bytes. */
108 struct segment_descriptor_64 {
109         struct segment_descriptor s;
110         u32 base_higher;
111         u32 pad_zero;
112 };
113
114 #endif
115
116 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
117                            unsigned long arg);
118
119 unsigned long segment_base(u16 selector)
120 {
121         struct descriptor_table gdt;
122         struct segment_descriptor *d;
123         unsigned long table_base;
124         unsigned long v;
125
126         if (selector == 0)
127                 return 0;
128
129         asm("sgdt %0" : "=m"(gdt));
130         table_base = gdt.base;
131
132         if (selector & 4) {           /* from ldt */
133                 u16 ldt_selector;
134
135                 asm("sldt %0" : "=g"(ldt_selector));
136                 table_base = segment_base(ldt_selector);
137         }
138         d = (struct segment_descriptor *)(table_base + (selector & ~7));
139         v = d->base_low | ((unsigned long)d->base_mid << 16) |
140                 ((unsigned long)d->base_high << 24);
141 #ifdef CONFIG_X86_64
142         if (d->system == 0 && (d->type == 2 || d->type == 9 || d->type == 11))
143                 v |= ((unsigned long) \
144                       ((struct segment_descriptor_64 *)d)->base_higher) << 32;
145 #endif
146         return v;
147 }
148 EXPORT_SYMBOL_GPL(segment_base);
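/*
 * Worked example (editor's sketch, not part of the original source): for a
 * selector of 0x001b, bit 2 is clear, so the GDT is used and the descriptor
 * sits at table_base + (0x001b & ~7) = table_base + 0x18.  For 0x0017,
 * bit 2 is set, so the LDT base (looked up via the LDTR selector) becomes
 * table_base and the descriptor is read at offset 0x10 within it.
 */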
149
150 static inline int valid_vcpu(int n)
151 {
152         return likely(n >= 0 && n < KVM_MAX_VCPUS);
153 }
154
155 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
156 {
157         if (!vcpu->fpu_active || vcpu->guest_fpu_loaded)
158                 return;
159
160         vcpu->guest_fpu_loaded = 1;
161         fx_save(&vcpu->host_fx_image);
162         fx_restore(&vcpu->guest_fx_image);
163 }
164 EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
165
166 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
167 {
168         if (!vcpu->guest_fpu_loaded)
169                 return;
170
171         vcpu->guest_fpu_loaded = 0;
172         fx_save(&vcpu->guest_fx_image);
173         fx_restore(&vcpu->host_fx_image);
174 }
175 EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
176
177 /*
178  * Switches to the specified vcpu, until a matching vcpu_put()
179  */
180 void vcpu_load(struct kvm_vcpu *vcpu)
181 {
182         int cpu;
183
184         mutex_lock(&vcpu->mutex);
185         cpu = get_cpu();
186         preempt_notifier_register(&vcpu->preempt_notifier);
187         kvm_arch_vcpu_load(vcpu, cpu);
188         put_cpu();
189 }
190
191 void vcpu_put(struct kvm_vcpu *vcpu)
192 {
193         preempt_disable();
194         kvm_arch_vcpu_put(vcpu);
195         preempt_notifier_unregister(&vcpu->preempt_notifier);
196         preempt_enable();
197         mutex_unlock(&vcpu->mutex);
198 }
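/*
 * Typical usage (illustrative sketch only): callers bracket per-vcpu work
 * with a load/put pair, e.g.
 *
 *	vcpu_load(vcpu);
 *	... touch vcpu state, issue kvm_x86_ops calls ...
 *	vcpu_put(vcpu);
 *
 * kvm_unload_vcpu_mmu() below follows exactly this pattern.
 */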
199
200 static void ack_flush(void *_completed)
201 {
202 }
203
204 void kvm_flush_remote_tlbs(struct kvm *kvm)
205 {
206         int i, cpu;
207         cpumask_t cpus;
208         struct kvm_vcpu *vcpu;
209
210         cpus_clear(cpus);
211         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
212                 vcpu = kvm->vcpus[i];
213                 if (!vcpu)
214                         continue;
215                 if (test_and_set_bit(KVM_TLB_FLUSH, &vcpu->requests))
216                         continue;
217                 cpu = vcpu->cpu;
218                 if (cpu != -1 && cpu != raw_smp_processor_id())
219                         cpu_set(cpu, cpus);
220         }
221         smp_call_function_mask(cpus, ack_flush, NULL, 1);
222 }
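/*
 * Editor's note: vcpus that already have KVM_TLB_FLUSH pending are skipped
 * above; for the rest, the (empty) ack_flush IPI simply kicks each remote
 * vcpu out of guest mode so the request bit is seen before the next guest
 * entry, and the wait=1 argument ensures all those exits have happened
 * before this function returns.
 */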
223
224 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
225 {
226         struct page *page;
227         int r;
228
229         mutex_init(&vcpu->mutex);
230         vcpu->cpu = -1;
231         vcpu->mmu.root_hpa = INVALID_PAGE;
232         vcpu->kvm = kvm;
233         vcpu->vcpu_id = id;
234         if (!irqchip_in_kernel(kvm) || id == 0)
235                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
236         else
237                 vcpu->mp_state = VCPU_MP_STATE_UNINITIALIZED;
238         init_waitqueue_head(&vcpu->wq);
239
240         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
241         if (!page) {
242                 r = -ENOMEM;
243                 goto fail;
244         }
245         vcpu->run = page_address(page);
246
247         page = alloc_page(GFP_KERNEL | __GFP_ZERO);
248         if (!page) {
249                 r = -ENOMEM;
250                 goto fail_free_run;
251         }
252         vcpu->pio_data = page_address(page);
253
254         r = kvm_mmu_create(vcpu);
255         if (r < 0)
256                 goto fail_free_pio_data;
257
258         if (irqchip_in_kernel(kvm)) {
259                 r = kvm_create_lapic(vcpu);
260                 if (r < 0)
261                         goto fail_mmu_destroy;
262         }
263
264         return 0;
265
266 fail_mmu_destroy:
267         kvm_mmu_destroy(vcpu);
268 fail_free_pio_data:
269         free_page((unsigned long)vcpu->pio_data);
270 fail_free_run:
271         free_page((unsigned long)vcpu->run);
272 fail:
273         return r;
274 }
275 EXPORT_SYMBOL_GPL(kvm_vcpu_init);
276
277 void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
278 {
279         kvm_free_lapic(vcpu);
280         kvm_mmu_destroy(vcpu);
281         free_page((unsigned long)vcpu->pio_data);
282         free_page((unsigned long)vcpu->run);
283 }
284 EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);
285
286 static struct kvm *kvm_create_vm(void)
287 {
288         struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
289
290         if (!kvm)
291                 return ERR_PTR(-ENOMEM);
292
293         kvm_io_bus_init(&kvm->pio_bus);
294         mutex_init(&kvm->lock);
295         INIT_LIST_HEAD(&kvm->active_mmu_pages);
296         kvm_io_bus_init(&kvm->mmio_bus);
297         spin_lock(&kvm_lock);
298         list_add(&kvm->vm_list, &vm_list);
299         spin_unlock(&kvm_lock);
300         return kvm;
301 }
302
303 static void kvm_free_userspace_physmem(struct kvm_memory_slot *free)
304 {
305         int i;
306
307         for (i = 0; i < free->npages; ++i) {
308                 if (free->phys_mem[i]) {
309                         if (!PageReserved(free->phys_mem[i]))
310                                 SetPageDirty(free->phys_mem[i]);
311                         page_cache_release(free->phys_mem[i]);
312                 }
313         }
314 }
315
316 static void kvm_free_kernel_physmem(struct kvm_memory_slot *free)
317 {
318         int i;
319
320         for (i = 0; i < free->npages; ++i)
321                 if (free->phys_mem[i])
322                         __free_page(free->phys_mem[i]);
323 }
324
325 /*
326  * Free any memory in @free but not in @dont.
327  */
328 static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
329                                   struct kvm_memory_slot *dont)
330 {
331         if (!dont || free->phys_mem != dont->phys_mem)
332                 if (free->phys_mem) {
333                         if (free->user_alloc)
334                                 kvm_free_userspace_physmem(free);
335                         else
336                                 kvm_free_kernel_physmem(free);
337                         vfree(free->phys_mem);
338                 }
339         if (!dont || free->rmap != dont->rmap)
340                 vfree(free->rmap);
341
342         if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
343                 vfree(free->dirty_bitmap);
344
345         free->phys_mem = NULL;
346         free->npages = 0;
347         free->dirty_bitmap = NULL;
348 }
349
350 static void kvm_free_physmem(struct kvm *kvm)
351 {
352         int i;
353
354         for (i = 0; i < kvm->nmemslots; ++i)
355                 kvm_free_physmem_slot(&kvm->memslots[i], NULL);
356 }
357
358 static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
359 {
360         int i;
361
362         for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
363                 if (vcpu->pio.guest_pages[i]) {
364                         __free_page(vcpu->pio.guest_pages[i]);
365                         vcpu->pio.guest_pages[i] = NULL;
366                 }
367 }
368
369 static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
370 {
371         vcpu_load(vcpu);
372         kvm_mmu_unload(vcpu);
373         vcpu_put(vcpu);
374 }
375
376 static void kvm_free_vcpus(struct kvm *kvm)
377 {
378         unsigned int i;
379
380         /*
381          * Unpin any mmu pages first.
382          */
383         for (i = 0; i < KVM_MAX_VCPUS; ++i)
384                 if (kvm->vcpus[i])
385                         kvm_unload_vcpu_mmu(kvm->vcpus[i]);
386         for (i = 0; i < KVM_MAX_VCPUS; ++i) {
387                 if (kvm->vcpus[i]) {
388                         kvm_x86_ops->vcpu_free(kvm->vcpus[i]);
389                         kvm->vcpus[i] = NULL;
390                 }
391         }
392
393 }
394
395 static void kvm_destroy_vm(struct kvm *kvm)
396 {
397         spin_lock(&kvm_lock);
398         list_del(&kvm->vm_list);
399         spin_unlock(&kvm_lock);
400         kvm_io_bus_destroy(&kvm->pio_bus);
401         kvm_io_bus_destroy(&kvm->mmio_bus);
402         kfree(kvm->vpic);
403         kfree(kvm->vioapic);
404         kvm_free_vcpus(kvm);
405         kvm_free_physmem(kvm);
406         kfree(kvm);
407 }
408
409 static int kvm_vm_release(struct inode *inode, struct file *filp)
410 {
411         struct kvm *kvm = filp->private_data;
412
413         kvm_destroy_vm(kvm);
414         return 0;
415 }
416
417 static void inject_gp(struct kvm_vcpu *vcpu)
418 {
419         kvm_x86_ops->inject_gp(vcpu, 0);
420 }
421
422 /*
423  * Load the pae pdptrs.  Return true if they are all valid.
424  */
425 static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
426 {
427         gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
428         unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
429         int i;
430         int ret;
431         u64 pdpte[ARRAY_SIZE(vcpu->pdptrs)];
432
433         mutex_lock(&vcpu->kvm->lock);
434         ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte,
435                                   offset * sizeof(u64), sizeof(pdpte));
436         if (ret < 0) {
437                 ret = 0;
438                 goto out;
439         }
440         for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
441                 if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
442                         ret = 0;
443                         goto out;
444                 }
445         }
446         ret = 1;
447
448         memcpy(vcpu->pdptrs, pdpte, sizeof(vcpu->pdptrs));
449 out:
450         mutex_unlock(&vcpu->kvm->lock);
451
452         return ret;
453 }
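/*
 * Worked example (editor's sketch): in PAE mode cr3 is 32-byte aligned, so
 * for cr3 = 0x12345020 the computation above gives
 * offset = ((0x020) >> 5) << 2 = 4, and the four 8-byte pdptes are read
 * starting at byte 32 of the page at gfn 0x12345.
 */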
454
455 void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
456 {
457         if (cr0 & CR0_RESERVED_BITS) {
458                 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n",
459                        cr0, vcpu->cr0);
460                 inject_gp(vcpu);
461                 return;
462         }
463
464         if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
465                 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
466                 inject_gp(vcpu);
467                 return;
468         }
469
470         if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
471                 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
472                        "and a clear PE flag\n");
473                 inject_gp(vcpu);
474                 return;
475         }
476
477         if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
478 #ifdef CONFIG_X86_64
479                 if ((vcpu->shadow_efer & EFER_LME)) {
480                         int cs_db, cs_l;
481
482                         if (!is_pae(vcpu)) {
483                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
484                                        "in long mode while PAE is disabled\n");
485                                 inject_gp(vcpu);
486                                 return;
487                         }
488                         kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
489                         if (cs_l) {
490                                 printk(KERN_DEBUG "set_cr0: #GP, start paging "
491                                        "in long mode while CS.L == 1\n");
492                                 inject_gp(vcpu);
493                                 return;
494
495                         }
496                 } else
497 #endif
498                 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
499                         printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
500                                "reserved bits\n");
501                         inject_gp(vcpu);
502                         return;
503                 }
504
505         }
506
507         kvm_x86_ops->set_cr0(vcpu, cr0);
508         vcpu->cr0 = cr0;
509
510         mutex_lock(&vcpu->kvm->lock);
511         kvm_mmu_reset_context(vcpu);
512         mutex_unlock(&vcpu->kvm->lock);
513         return;
514 }
515 EXPORT_SYMBOL_GPL(set_cr0);
516
517 void lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
518 {
519         set_cr0(vcpu, (vcpu->cr0 & ~0x0ful) | (msw & 0x0f));
520 }
521 EXPORT_SYMBOL_GPL(lmsw);
522
523 void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
524 {
525         if (cr4 & CR4_RESERVED_BITS) {
526                 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
527                 inject_gp(vcpu);
528                 return;
529         }
530
531         if (is_long_mode(vcpu)) {
532                 if (!(cr4 & X86_CR4_PAE)) {
533                         printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
534                                "in long mode\n");
535                         inject_gp(vcpu);
536                         return;
537                 }
538         } else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & X86_CR4_PAE)
539                    && !load_pdptrs(vcpu, vcpu->cr3)) {
540                 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
541                 inject_gp(vcpu);
542                 return;
543         }
544
545         if (cr4 & X86_CR4_VMXE) {
546                 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
547                 inject_gp(vcpu);
548                 return;
549         }
550         kvm_x86_ops->set_cr4(vcpu, cr4);
551         vcpu->cr4 = cr4;
552         mutex_lock(&vcpu->kvm->lock);
553         kvm_mmu_reset_context(vcpu);
554         mutex_unlock(&vcpu->kvm->lock);
555 }
556 EXPORT_SYMBOL_GPL(set_cr4);
557
558 void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
559 {
560         if (is_long_mode(vcpu)) {
561                 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
562                         printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
563                         inject_gp(vcpu);
564                         return;
565                 }
566         } else {
567                 if (is_pae(vcpu)) {
568                         if (cr3 & CR3_PAE_RESERVED_BITS) {
569                                 printk(KERN_DEBUG
570                                        "set_cr3: #GP, reserved bits\n");
571                                 inject_gp(vcpu);
572                                 return;
573                         }
574                         if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
575                                 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
576                                        "reserved bits\n");
577                                 inject_gp(vcpu);
578                                 return;
579                         }
580                 }
581                 /*
582                  * We don't check reserved bits in nonpae mode, because
583                  * this isn't enforced, and VMware depends on this.
584                  */
585         }
586
587         mutex_lock(&vcpu->kvm->lock);
588         /*
589          * Does the new cr3 value map to physical memory? (Note, we
590          * catch an invalid cr3 even in real-mode, because it would
591          * cause trouble later on when we turn on paging anyway.)
592          *
593          * A real CPU would silently accept an invalid cr3 and would
594          * attempt to use it - with largely undefined (and often hard
595          * to debug) behavior on the guest side.
596          */
597         if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
598                 inject_gp(vcpu);
599         else {
600                 vcpu->cr3 = cr3;
601                 vcpu->mmu.new_cr3(vcpu);
602         }
603         mutex_unlock(&vcpu->kvm->lock);
604 }
605 EXPORT_SYMBOL_GPL(set_cr3);
606
607 void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
608 {
609         if (cr8 & CR8_RESERVED_BITS) {
610                 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
611                 inject_gp(vcpu);
612                 return;
613         }
614         if (irqchip_in_kernel(vcpu->kvm))
615                 kvm_lapic_set_tpr(vcpu, cr8);
616         else
617                 vcpu->cr8 = cr8;
618 }
619 EXPORT_SYMBOL_GPL(set_cr8);
620
621 unsigned long get_cr8(struct kvm_vcpu *vcpu)
622 {
623         if (irqchip_in_kernel(vcpu->kvm))
624                 return kvm_lapic_get_cr8(vcpu);
625         else
626                 return vcpu->cr8;
627 }
628 EXPORT_SYMBOL_GPL(get_cr8);
629
630 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
631 {
632         if (irqchip_in_kernel(vcpu->kvm))
633                 return vcpu->apic_base;
634         else
635                 return vcpu->apic_base;
636 }
637 EXPORT_SYMBOL_GPL(kvm_get_apic_base);
638
639 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
640 {
641         /* TODO: reserve bits check */
642         if (irqchip_in_kernel(vcpu->kvm))
643                 kvm_lapic_set_base(vcpu, data);
644         else
645                 vcpu->apic_base = data;
646 }
647 EXPORT_SYMBOL_GPL(kvm_set_apic_base);
648
649 void fx_init(struct kvm_vcpu *vcpu)
650 {
651         unsigned after_mxcsr_mask;
652
653         /* Initialize guest FPU by resetting ours and saving into guest's */
654         preempt_disable();
655         fx_save(&vcpu->host_fx_image);
656         fpu_init();
657         fx_save(&vcpu->guest_fx_image);
658         fx_restore(&vcpu->host_fx_image);
659         preempt_enable();
660
661         vcpu->cr0 |= X86_CR0_ET;
662         after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
663         vcpu->guest_fx_image.mxcsr = 0x1f80;
664         memset((void *)&vcpu->guest_fx_image + after_mxcsr_mask,
665                0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
666 }
667 EXPORT_SYMBOL_GPL(fx_init);
668
669 /*
670  * Allocate some memory and give it an address in the guest physical address
671  * space.
672  *
673  * Discontiguous memory is allowed, mostly for framebuffers.
674  */
675 static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
676                                           struct
677                                           kvm_userspace_memory_region *mem,
678                                           int user_alloc)
679 {
680         int r;
681         gfn_t base_gfn;
682         unsigned long npages;
683         unsigned long i;
684         struct kvm_memory_slot *memslot;
685         struct kvm_memory_slot old, new;
686
687         r = -EINVAL;
688         /* General sanity checks */
689         if (mem->memory_size & (PAGE_SIZE - 1))
690                 goto out;
691         if (mem->guest_phys_addr & (PAGE_SIZE - 1))
692                 goto out;
693         if (mem->slot >= KVM_MEMORY_SLOTS)
694                 goto out;
695         if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
696                 goto out;
697
698         memslot = &kvm->memslots[mem->slot];
699         base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
700         npages = mem->memory_size >> PAGE_SHIFT;
701
702         if (!npages)
703                 mem->flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
704
705         mutex_lock(&kvm->lock);
706
707         new = old = *memslot;
708
709         new.base_gfn = base_gfn;
710         new.npages = npages;
711         new.flags = mem->flags;
712
713         /* Disallow changing a memory slot's size. */
714         r = -EINVAL;
715         if (npages && old.npages && npages != old.npages)
716                 goto out_unlock;
717
718         /* Check for overlaps */
719         r = -EEXIST;
720         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
721                 struct kvm_memory_slot *s = &kvm->memslots[i];
722
723                 if (s == memslot)
724                         continue;
725                 if (!((base_gfn + npages <= s->base_gfn) ||
726                       (base_gfn >= s->base_gfn + s->npages)))
727                         goto out_unlock;
728         }
729
730         /* Deallocate if slot is being removed */
731         if (!npages)
732                 new.phys_mem = NULL;
733
734         /* Free page dirty bitmap if unneeded */
735         if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
736                 new.dirty_bitmap = NULL;
737
738         r = -ENOMEM;
739
740         /* Allocate if a slot is being created */
741         if (npages && !new.phys_mem) {
742                 new.phys_mem = vmalloc(npages * sizeof(struct page *));
743
744                 if (!new.phys_mem)
745                         goto out_unlock;
746
747                 new.rmap = vmalloc(npages * sizeof(struct page *));
748
749                 if (!new.rmap)
750                         goto out_unlock;
751
752                 memset(new.phys_mem, 0, npages * sizeof(struct page *));
753                 memset(new.rmap, 0, npages * sizeof(*new.rmap));
754                 if (user_alloc) {
755                         unsigned long pages_num;
756
757                         new.user_alloc = 1;
758                         down_read(&current->mm->mmap_sem);
759
760                         pages_num = get_user_pages(current, current->mm,
761                                                    mem->userspace_addr,
762                                                    npages, 1, 1, new.phys_mem,
763                                                    NULL);
764
765                         up_read(&current->mm->mmap_sem);
766                         if (pages_num != npages)
767                                 goto out_unlock;
768                 } else {
769                         for (i = 0; i < npages; ++i) {
770                                 new.phys_mem[i] = alloc_page(GFP_HIGHUSER
771                                                              | __GFP_ZERO);
772                                 if (!new.phys_mem[i])
773                                         goto out_unlock;
774                         }
775                 }
776         }
777
778         /* Allocate page dirty bitmap if needed */
779         if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
780                 unsigned dirty_bytes = ALIGN(npages, BITS_PER_LONG) / 8;
781
782                 new.dirty_bitmap = vmalloc(dirty_bytes);
783                 if (!new.dirty_bitmap)
784                         goto out_unlock;
785                 memset(new.dirty_bitmap, 0, dirty_bytes);
786         }
787
788         if (mem->slot >= kvm->nmemslots)
789                 kvm->nmemslots = mem->slot + 1;
790
791         if (!kvm->n_requested_mmu_pages) {
792                 unsigned int n_pages;
793
794                 if (npages) {
795                         n_pages = npages * KVM_PERMILLE_MMU_PAGES / 1000;
796                         kvm_mmu_change_mmu_pages(kvm, kvm->n_alloc_mmu_pages +
797                                                  n_pages);
798                 } else {
799                         unsigned int nr_mmu_pages;
800
801                         n_pages = old.npages * KVM_PERMILLE_MMU_PAGES / 1000;
802                         nr_mmu_pages = kvm->n_alloc_mmu_pages - n_pages;
803                         nr_mmu_pages = max(nr_mmu_pages,
804                                         (unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
805                         kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
806                 }
807         }
808
809         *memslot = new;
810
811         kvm_mmu_slot_remove_write_access(kvm, mem->slot);
812         kvm_flush_remote_tlbs(kvm);
813
814         mutex_unlock(&kvm->lock);
815
816         kvm_free_physmem_slot(&old, &new);
817         return 0;
818
819 out_unlock:
820         mutex_unlock(&kvm->lock);
821         kvm_free_physmem_slot(&new, &old);
822 out:
823         return r;
824 }
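/*
 * Hypothetical userspace sketch (editor's addition, not part of this file):
 * with user_alloc, a qemu-style caller hands an existing mapping to the VM
 * fd roughly like this; vm_fd, ram_size and ram_ptr are illustrative names.
 *
 *	struct kvm_userspace_memory_region region = {
 *		.slot            = 0,
 *		.flags           = KVM_MEM_LOG_DIRTY_PAGES,
 *		.guest_phys_addr = 0,
 *		.memory_size     = ram_size,
 *		.userspace_addr  = (unsigned long)ram_ptr,
 *	};
 *	ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
 */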
825
826 static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
827                                           u32 kvm_nr_mmu_pages)
828 {
829         if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
830                 return -EINVAL;
831
832         mutex_lock(&kvm->lock);
833
834         kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
835         kvm->n_requested_mmu_pages = kvm_nr_mmu_pages;
836
837         mutex_unlock(&kvm->lock);
838         return 0;
839 }
840
841 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
842 {
843         return kvm->n_alloc_mmu_pages;
844 }
845
846 /*
847  * Get (and clear) the dirty memory log for a memory slot.
848  */
849 static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
850                                       struct kvm_dirty_log *log)
851 {
852         struct kvm_memory_slot *memslot;
853         int r, i;
854         int n;
855         unsigned long any = 0;
856
857         mutex_lock(&kvm->lock);
858
859         r = -EINVAL;
860         if (log->slot >= KVM_MEMORY_SLOTS)
861                 goto out;
862
863         memslot = &kvm->memslots[log->slot];
864         r = -ENOENT;
865         if (!memslot->dirty_bitmap)
866                 goto out;
867
868         n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
869
870         for (i = 0; !any && i < n/sizeof(long); ++i)
871                 any = memslot->dirty_bitmap[i];
872
873         r = -EFAULT;
874         if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n))
875                 goto out;
876
877         /* If nothing is dirty, don't bother messing with page tables. */
878         if (any) {
879                 kvm_mmu_slot_remove_write_access(kvm, log->slot);
880                 kvm_flush_remote_tlbs(kvm);
881                 memset(memslot->dirty_bitmap, 0, n);
882         }
883
884         r = 0;
885
886 out:
887         mutex_unlock(&kvm->lock);
888         return r;
889 }
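/*
 * Illustrative caller sketch (editor's addition): userspace passes a slot
 * number and a bitmap buffer with one bit per page in that slot, rounded
 * up to a multiple of sizeof(long), matching the ALIGN() above:
 *
 *	struct kvm_dirty_log log = {
 *		.slot         = 0,
 *		.dirty_bitmap = bitmap,
 *	};
 *	ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
 */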
890
891 /*
892  * Set a new alias region.  Aliases map a portion of physical memory into
893  * another portion.  This is useful for memory windows, for example the PC
894  * VGA region.
895  */
896 static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
897                                          struct kvm_memory_alias *alias)
898 {
899         int r, n;
900         struct kvm_mem_alias *p;
901
902         r = -EINVAL;
903         /* General sanity checks */
904         if (alias->memory_size & (PAGE_SIZE - 1))
905                 goto out;
906         if (alias->guest_phys_addr & (PAGE_SIZE - 1))
907                 goto out;
908         if (alias->slot >= KVM_ALIAS_SLOTS)
909                 goto out;
910         if (alias->guest_phys_addr + alias->memory_size
911             < alias->guest_phys_addr)
912                 goto out;
913         if (alias->target_phys_addr + alias->memory_size
914             < alias->target_phys_addr)
915                 goto out;
916
917         mutex_lock(&kvm->lock);
918
919         p = &kvm->aliases[alias->slot];
920         p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
921         p->npages = alias->memory_size >> PAGE_SHIFT;
922         p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
923
924         for (n = KVM_ALIAS_SLOTS; n > 0; --n)
925                 if (kvm->aliases[n - 1].npages)
926                         break;
927         kvm->naliases = n;
928
929         kvm_mmu_zap_all(kvm);
930
931         mutex_unlock(&kvm->lock);
932
933         return 0;
934
935 out:
936         return r;
937 }
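/*
 * Example (editor's sketch): to expose the legacy VGA window, userspace
 * could alias guest_phys_addr 0xa0000 (memory_size 0x20000) onto the part
 * of a memory slot that actually backs the framebuffer; gfn_to_page() then
 * resolves accesses to 0xa0000-0xbffff through unalias_gfn() below.
 */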
938
939 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
940 {
941         int r;
942
943         r = 0;
944         switch (chip->chip_id) {
945         case KVM_IRQCHIP_PIC_MASTER:
946                 memcpy(&chip->chip.pic,
947                         &pic_irqchip(kvm)->pics[0],
948                         sizeof(struct kvm_pic_state));
949                 break;
950         case KVM_IRQCHIP_PIC_SLAVE:
951                 memcpy(&chip->chip.pic,
952                         &pic_irqchip(kvm)->pics[1],
953                         sizeof(struct kvm_pic_state));
954                 break;
955         case KVM_IRQCHIP_IOAPIC:
956                 memcpy(&chip->chip.ioapic,
957                         ioapic_irqchip(kvm),
958                         sizeof(struct kvm_ioapic_state));
959                 break;
960         default:
961                 r = -EINVAL;
962                 break;
963         }
964         return r;
965 }
966
967 static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
968 {
969         int r;
970
971         r = 0;
972         switch (chip->chip_id) {
973         case KVM_IRQCHIP_PIC_MASTER:
974                 memcpy(&pic_irqchip(kvm)->pics[0],
975                         &chip->chip.pic,
976                         sizeof(struct kvm_pic_state));
977                 break;
978         case KVM_IRQCHIP_PIC_SLAVE:
979                 memcpy(&pic_irqchip(kvm)->pics[1],
980                         &chip->chip.pic,
981                         sizeof(struct kvm_pic_state));
982                 break;
983         case KVM_IRQCHIP_IOAPIC:
984                 memcpy(ioapic_irqchip(kvm),
985                         &chip->chip.ioapic,
986                         sizeof(struct kvm_ioapic_state));
987                 break;
988         default:
989                 r = -EINVAL;
990                 break;
991         }
992         kvm_pic_update_irq(pic_irqchip(kvm));
993         return r;
994 }
995
996 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
997 {
998         int i;
999         struct kvm_mem_alias *alias;
1000
1001         for (i = 0; i < kvm->naliases; ++i) {
1002                 alias = &kvm->aliases[i];
1003                 if (gfn >= alias->base_gfn
1004                     && gfn < alias->base_gfn + alias->npages)
1005                         return alias->target_gfn + gfn - alias->base_gfn;
1006         }
1007         return gfn;
1008 }
1009
1010 static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1011 {
1012         int i;
1013
1014         for (i = 0; i < kvm->nmemslots; ++i) {
1015                 struct kvm_memory_slot *memslot = &kvm->memslots[i];
1016
1017                 if (gfn >= memslot->base_gfn
1018                     && gfn < memslot->base_gfn + memslot->npages)
1019                         return memslot;
1020         }
1021         return NULL;
1022 }
1023
1024 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
1025 {
1026         gfn = unalias_gfn(kvm, gfn);
1027         return __gfn_to_memslot(kvm, gfn);
1028 }
1029
1030 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
1031 {
1032         struct kvm_memory_slot *slot;
1033
1034         gfn = unalias_gfn(kvm, gfn);
1035         slot = __gfn_to_memslot(kvm, gfn);
1036         if (!slot)
1037                 return NULL;
1038         return slot->phys_mem[gfn - slot->base_gfn];
1039 }
1040 EXPORT_SYMBOL_GPL(gfn_to_page);
1041
1042 static int next_segment(unsigned long len, int offset)
1043 {
1044         if (len > PAGE_SIZE - offset)
1045                 return PAGE_SIZE - offset;
1046         else
1047                 return len;
1048 }
1049
1050 int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
1051                         int len)
1052 {
1053         void *page_virt;
1054         struct page *page;
1055
1056         page = gfn_to_page(kvm, gfn);
1057         if (!page)
1058                 return -EFAULT;
1059         page_virt = kmap_atomic(page, KM_USER0);
1060
1061         memcpy(data, page_virt + offset, len);
1062
1063         kunmap_atomic(page_virt, KM_USER0);
1064         return 0;
1065 }
1066 EXPORT_SYMBOL_GPL(kvm_read_guest_page);
1067
1068 int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len)
1069 {
1070         gfn_t gfn = gpa >> PAGE_SHIFT;
1071         int seg;
1072         int offset = offset_in_page(gpa);
1073         int ret;
1074
1075         while ((seg = next_segment(len, offset)) != 0) {
1076                 ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
1077                 if (ret < 0)
1078                         return ret;
1079                 offset = 0;
1080                 len -= seg;
1081                 data += seg;
1082                 ++gfn;
1083         }
1084         return 0;
1085 }
1086 EXPORT_SYMBOL_GPL(kvm_read_guest);
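/*
 * Worked example (editor's addition): a read of 8 bytes at gpa 0xffe is
 * split by next_segment() into a 2-byte copy from offset 0xffe of the
 * first page and a 6-byte copy from offset 0 of the following page.
 */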
1087
1088 int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
1089                          int offset, int len)
1090 {
1091         void *page_virt;
1092         struct page *page;
1093
1094         page = gfn_to_page(kvm, gfn);
1095         if (!page)
1096                 return -EFAULT;
1097         page_virt = kmap_atomic(page, KM_USER0);
1098
1099         memcpy(page_virt + offset, data, len);
1100
1101         kunmap_atomic(page_virt, KM_USER0);
1102         mark_page_dirty(kvm, gfn);
1103         return 0;
1104 }
1105 EXPORT_SYMBOL_GPL(kvm_write_guest_page);
1106
1107 int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data,
1108                     unsigned long len)
1109 {
1110         gfn_t gfn = gpa >> PAGE_SHIFT;
1111         int seg;
1112         int offset = offset_in_page(gpa);
1113         int ret;
1114
1115         while ((seg = next_segment(len, offset)) != 0) {
1116                 ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
1117                 if (ret < 0)
1118                         return ret;
1119                 offset = 0;
1120                 len -= seg;
1121                 data += seg;
1122                 ++gfn;
1123         }
1124         return 0;
1125 }
1126
1127 int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
1128 {
1129         void *page_virt;
1130         struct page *page;
1131
1132         page = gfn_to_page(kvm, gfn);
1133         if (!page)
1134                 return -EFAULT;
1135         page_virt = kmap_atomic(page, KM_USER0);
1136
1137         memset(page_virt + offset, 0, len);
1138
1139         kunmap_atomic(page_virt, KM_USER0);
1140         return 0;
1141 }
1142 EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
1143
1144 int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len)
1145 {
1146         gfn_t gfn = gpa >> PAGE_SHIFT;
1147         int seg;
1148         int offset = offset_in_page(gpa);
1149         int ret;
1150
1151         while ((seg = next_segment(len, offset)) != 0) {
1152                 ret = kvm_clear_guest_page(kvm, gfn, offset, seg);
1153                 if (ret < 0)
1154                         return ret;
1155                 offset = 0;
1156                 len -= seg;
1157                 ++gfn;
1158         }
1159         return 0;
1160 }
1161 EXPORT_SYMBOL_GPL(kvm_clear_guest);
1162
1163 /* WARNING: Does not work on aliased pages. */
1164 void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
1165 {
1166         struct kvm_memory_slot *memslot;
1167
1168         memslot = __gfn_to_memslot(kvm, gfn);
1169         if (memslot && memslot->dirty_bitmap) {
1170                 unsigned long rel_gfn = gfn - memslot->base_gfn;
1171
1172                 /* avoid RMW */
1173                 if (!test_bit(rel_gfn, memslot->dirty_bitmap))
1174                         set_bit(rel_gfn, memslot->dirty_bitmap);
1175         }
1176 }
1177
1178 int emulator_read_std(unsigned long addr,
1179                              void *val,
1180                              unsigned int bytes,
1181                              struct kvm_vcpu *vcpu)
1182 {
1183         void *data = val;
1184
1185         while (bytes) {
1186                 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1187                 unsigned offset = addr & (PAGE_SIZE-1);
1188                 unsigned tocopy = min(bytes, (unsigned)PAGE_SIZE - offset);
1189                 int ret;
1190
1191                 if (gpa == UNMAPPED_GVA)
1192                         return X86EMUL_PROPAGATE_FAULT;
1193                 ret = kvm_read_guest(vcpu->kvm, gpa, data, tocopy);
1194                 if (ret < 0)
1195                         return X86EMUL_UNHANDLEABLE;
1196
1197                 bytes -= tocopy;
1198                 data += tocopy;
1199                 addr += tocopy;
1200         }
1201
1202         return X86EMUL_CONTINUE;
1203 }
1204 EXPORT_SYMBOL_GPL(emulator_read_std);
1205
1206 static int emulator_write_std(unsigned long addr,
1207                               const void *val,
1208                               unsigned int bytes,
1209                               struct kvm_vcpu *vcpu)
1210 {
1211         pr_unimpl(vcpu, "emulator_write_std: addr %lx n %d\n", addr, bytes);
1212         return X86EMUL_UNHANDLEABLE;
1213 }
1214
1215 /*
1216  * Only the apic needs an MMIO device hook, so shortcut now..
1217  */
1218 static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1219                                                 gpa_t addr)
1220 {
1221         struct kvm_io_device *dev;
1222
1223         if (vcpu->apic) {
1224                 dev = &vcpu->apic->dev;
1225                 if (dev->in_range(dev, addr))
1226                         return dev;
1227         }
1228         return NULL;
1229 }
1230
1231 static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1232                                                 gpa_t addr)
1233 {
1234         struct kvm_io_device *dev;
1235
1236         dev = vcpu_find_pervcpu_dev(vcpu, addr);
1237         if (dev == NULL)
1238                 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr);
1239         return dev;
1240 }
1241
1242 static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
1243                                                gpa_t addr)
1244 {
1245         return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr);
1246 }
1247
1248 static int emulator_read_emulated(unsigned long addr,
1249                                   void *val,
1250                                   unsigned int bytes,
1251                                   struct kvm_vcpu *vcpu)
1252 {
1253         struct kvm_io_device *mmio_dev;
1254         gpa_t                 gpa;
1255
1256         if (vcpu->mmio_read_completed) {
1257                 memcpy(val, vcpu->mmio_data, bytes);
1258                 vcpu->mmio_read_completed = 0;
1259                 return X86EMUL_CONTINUE;
1260         } else if (emulator_read_std(addr, val, bytes, vcpu)
1261                    == X86EMUL_CONTINUE)
1262                 return X86EMUL_CONTINUE;
1263
1264         gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1265         if (gpa == UNMAPPED_GVA)
1266                 return X86EMUL_PROPAGATE_FAULT;
1267
1268         /*
1269          * Is this MMIO handled locally?
1270          */
1271         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1272         if (mmio_dev) {
1273                 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1274                 return X86EMUL_CONTINUE;
1275         }
1276
1277         vcpu->mmio_needed = 1;
1278         vcpu->mmio_phys_addr = gpa;
1279         vcpu->mmio_size = bytes;
1280         vcpu->mmio_is_write = 0;
1281
1282         return X86EMUL_UNHANDLEABLE;
1283 }
1284
1285 static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
1286                                const void *val, int bytes)
1287 {
1288         int ret;
1289
1290         ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
1291         if (ret < 0)
1292                 return 0;
1293         kvm_mmu_pte_write(vcpu, gpa, val, bytes);
1294         return 1;
1295 }
1296
1297 static int emulator_write_emulated_onepage(unsigned long addr,
1298                                            const void *val,
1299                                            unsigned int bytes,
1300                                            struct kvm_vcpu *vcpu)
1301 {
1302         struct kvm_io_device *mmio_dev;
1303         gpa_t                 gpa = vcpu->mmu.gva_to_gpa(vcpu, addr);
1304
1305         if (gpa == UNMAPPED_GVA) {
1306                 kvm_x86_ops->inject_page_fault(vcpu, addr, 2);
1307                 return X86EMUL_PROPAGATE_FAULT;
1308         }
1309
1310         if (emulator_write_phys(vcpu, gpa, val, bytes))
1311                 return X86EMUL_CONTINUE;
1312
1313         /*
1314          * Is this MMIO handled locally?
1315          */
1316         mmio_dev = vcpu_find_mmio_dev(vcpu, gpa);
1317         if (mmio_dev) {
1318                 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1319                 return X86EMUL_CONTINUE;
1320         }
1321
1322         vcpu->mmio_needed = 1;
1323         vcpu->mmio_phys_addr = gpa;
1324         vcpu->mmio_size = bytes;
1325         vcpu->mmio_is_write = 1;
1326         memcpy(vcpu->mmio_data, val, bytes);
1327
1328         return X86EMUL_CONTINUE;
1329 }
1330
1331 int emulator_write_emulated(unsigned long addr,
1332                                    const void *val,
1333                                    unsigned int bytes,
1334                                    struct kvm_vcpu *vcpu)
1335 {
1336         /* Crossing a page boundary? */
1337         if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
1338                 int rc, now;
1339
1340                 now = -addr & ~PAGE_MASK;
1341                 rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
1342                 if (rc != X86EMUL_CONTINUE)
1343                         return rc;
1344                 addr += now;
1345                 val += now;
1346                 bytes -= now;
1347         }
1348         return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
1349 }
1350 EXPORT_SYMBOL_GPL(emulator_write_emulated);
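/*
 * Worked example (editor's addition): for addr = 0x1ffe and bytes = 4,
 * "now = -addr & ~PAGE_MASK" yields 2, so the first two bytes are written
 * to the page at 0x1000 and the remaining two bytes go in a second call
 * to the page at 0x2000.
 */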
1351
1352 static int emulator_cmpxchg_emulated(unsigned long addr,
1353                                      const void *old,
1354                                      const void *new,
1355                                      unsigned int bytes,
1356                                      struct kvm_vcpu *vcpu)
1357 {
1358         static int reported;
1359
1360         if (!reported) {
1361                 reported = 1;
1362                 printk(KERN_WARNING "kvm: emulating exchange as write\n");
1363         }
1364         return emulator_write_emulated(addr, new, bytes, vcpu);
1365 }
1366
1367 static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
1368 {
1369         return kvm_x86_ops->get_segment_base(vcpu, seg);
1370 }
1371
1372 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
1373 {
1374         return X86EMUL_CONTINUE;
1375 }
1376
1377 int emulate_clts(struct kvm_vcpu *vcpu)
1378 {
1379         kvm_x86_ops->set_cr0(vcpu, vcpu->cr0 & ~X86_CR0_TS);
1380         return X86EMUL_CONTINUE;
1381 }
1382
1383 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
1384 {
1385         struct kvm_vcpu *vcpu = ctxt->vcpu;
1386
1387         switch (dr) {
1388         case 0 ... 3:
1389                 *dest = kvm_x86_ops->get_dr(vcpu, dr);
1390                 return X86EMUL_CONTINUE;
1391         default:
1392                 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __FUNCTION__, dr);
1393                 return X86EMUL_UNHANDLEABLE;
1394         }
1395 }
1396
1397 int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
1398 {
1399         unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
1400         int exception;
1401
1402         kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception);
1403         if (exception) {
1404                 /* FIXME: better handling */
1405                 return X86EMUL_UNHANDLEABLE;
1406         }
1407         return X86EMUL_CONTINUE;
1408 }
1409
1410 void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
1411 {
1412         static int reported;
1413         u8 opcodes[4];
1414         unsigned long rip = vcpu->rip;
1415         unsigned long rip_linear;
1416
1417         rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
1418
1419         if (reported)
1420                 return;
1421
1422         emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
1423
1424         printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
1425                context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
1426         reported = 1;
1427 }
1428 EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
1429
1430 struct x86_emulate_ops emulate_ops = {
1431         .read_std            = emulator_read_std,
1432         .write_std           = emulator_write_std,
1433         .read_emulated       = emulator_read_emulated,
1434         .write_emulated      = emulator_write_emulated,
1435         .cmpxchg_emulated    = emulator_cmpxchg_emulated,
1436 };
1437
1438 int emulate_instruction(struct kvm_vcpu *vcpu,
1439                         struct kvm_run *run,
1440                         unsigned long cr2,
1441                         u16 error_code,
1442                         int no_decode)
1443 {
1444         int r;
1445
1446         vcpu->mmio_fault_cr2 = cr2;
1447         kvm_x86_ops->cache_regs(vcpu);
1448
1449         vcpu->mmio_is_write = 0;
1450         vcpu->pio.string = 0;
1451
1452         if (!no_decode) {
1453                 int cs_db, cs_l;
1454                 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
1455
1456                 vcpu->emulate_ctxt.vcpu = vcpu;
1457                 vcpu->emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
1458                 vcpu->emulate_ctxt.cr2 = cr2;
1459                 vcpu->emulate_ctxt.mode =
1460                         (vcpu->emulate_ctxt.eflags & X86_EFLAGS_VM)
1461                         ? X86EMUL_MODE_REAL : cs_l
1462                         ? X86EMUL_MODE_PROT64 : cs_db
1463                         ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
1464
1465                 if (vcpu->emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
1466                         vcpu->emulate_ctxt.cs_base = 0;
1467                         vcpu->emulate_ctxt.ds_base = 0;
1468                         vcpu->emulate_ctxt.es_base = 0;
1469                         vcpu->emulate_ctxt.ss_base = 0;
1470                 } else {
1471                         vcpu->emulate_ctxt.cs_base =
1472                                         get_segment_base(vcpu, VCPU_SREG_CS);
1473                         vcpu->emulate_ctxt.ds_base =
1474                                         get_segment_base(vcpu, VCPU_SREG_DS);
1475                         vcpu->emulate_ctxt.es_base =
1476                                         get_segment_base(vcpu, VCPU_SREG_ES);
1477                         vcpu->emulate_ctxt.ss_base =
1478                                         get_segment_base(vcpu, VCPU_SREG_SS);
1479                 }
1480
1481                 vcpu->emulate_ctxt.gs_base =
1482                                         get_segment_base(vcpu, VCPU_SREG_GS);
1483                 vcpu->emulate_ctxt.fs_base =
1484                                         get_segment_base(vcpu, VCPU_SREG_FS);
1485
1486                 r = x86_decode_insn(&vcpu->emulate_ctxt, &emulate_ops);
1487                 if (r)  {
1488                         if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1489                                 return EMULATE_DONE;
1490                         return EMULATE_FAIL;
1491                 }
1492         }
1493
1494         r = x86_emulate_insn(&vcpu->emulate_ctxt, &emulate_ops);
1495
1496         if (vcpu->pio.string)
1497                 return EMULATE_DO_MMIO;
1498
1499         if ((r || vcpu->mmio_is_write) && run) {
1500                 run->exit_reason = KVM_EXIT_MMIO;
1501                 run->mmio.phys_addr = vcpu->mmio_phys_addr;
1502                 memcpy(run->mmio.data, vcpu->mmio_data, 8);
1503                 run->mmio.len = vcpu->mmio_size;
1504                 run->mmio.is_write = vcpu->mmio_is_write;
1505         }
1506
1507         if (r) {
1508                 if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
1509                         return EMULATE_DONE;
1510                 if (!vcpu->mmio_needed) {
1511                         kvm_report_emulation_failure(vcpu, "mmio");
1512                         return EMULATE_FAIL;
1513                 }
1514                 return EMULATE_DO_MMIO;
1515         }
1516
1517         kvm_x86_ops->decache_regs(vcpu);
1518         kvm_x86_ops->set_rflags(vcpu, vcpu->emulate_ctxt.eflags);
1519
1520         if (vcpu->mmio_is_write) {
1521                 vcpu->mmio_needed = 0;
1522                 return EMULATE_DO_MMIO;
1523         }
1524
1525         return EMULATE_DONE;
1526 }
1527 EXPORT_SYMBOL_GPL(emulate_instruction);
1528
1529 /*
1530  * The vCPU has executed a HLT instruction with in-kernel mode enabled.
1531  */
1532 static void kvm_vcpu_block(struct kvm_vcpu *vcpu)
1533 {
1534         DECLARE_WAITQUEUE(wait, current);
1535
1536         add_wait_queue(&vcpu->wq, &wait);
1537
1538         /*
1539          * We will block until either an interrupt or a signal wakes us up
1540          */
1541         while (!kvm_cpu_has_interrupt(vcpu)
1542                && !signal_pending(current)
1543                && vcpu->mp_state != VCPU_MP_STATE_RUNNABLE
1544                && vcpu->mp_state != VCPU_MP_STATE_SIPI_RECEIVED) {
1545                 set_current_state(TASK_INTERRUPTIBLE);
1546                 vcpu_put(vcpu);
1547                 schedule();
1548                 vcpu_load(vcpu);
1549         }
1550
1551         __set_current_state(TASK_RUNNING);
1552         remove_wait_queue(&vcpu->wq, &wait);
1553 }
1554
1555 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
1556 {
1557         ++vcpu->stat.halt_exits;
1558         if (irqchip_in_kernel(vcpu->kvm)) {
1559                 vcpu->mp_state = VCPU_MP_STATE_HALTED;
1560                 kvm_vcpu_block(vcpu);
1561                 if (vcpu->mp_state != VCPU_MP_STATE_RUNNABLE)
1562                         return -EINTR;
1563                 return 1;
1564         } else {
1565                 vcpu->run->exit_reason = KVM_EXIT_HLT;
1566                 return 0;
1567         }
1568 }
1569 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
1570
1571 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
1572 {
1573         unsigned long nr, a0, a1, a2, a3, ret;
1574
1575         kvm_x86_ops->cache_regs(vcpu);
1576
1577         nr = vcpu->regs[VCPU_REGS_RAX];
1578         a0 = vcpu->regs[VCPU_REGS_RBX];
1579         a1 = vcpu->regs[VCPU_REGS_RCX];
1580         a2 = vcpu->regs[VCPU_REGS_RDX];
1581         a3 = vcpu->regs[VCPU_REGS_RSI];
1582
1583         if (!is_long_mode(vcpu)) {
1584                 nr &= 0xFFFFFFFF;
1585                 a0 &= 0xFFFFFFFF;
1586                 a1 &= 0xFFFFFFFF;
1587                 a2 &= 0xFFFFFFFF;
1588                 a3 &= 0xFFFFFFFF;
1589         }
1590
1591         switch (nr) {
1592         default:
1593                 ret = -KVM_ENOSYS;
1594                 break;
1595         }
1596         vcpu->regs[VCPU_REGS_RAX] = ret;
1597         kvm_x86_ops->decache_regs(vcpu);
1598         return 0;
1599 }
1600 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
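/*
 * Guest-side sketch (editor's addition, hypothetical): a paravirtual guest
 * places the hypercall number in rax and up to four arguments in
 * rbx/rcx/rdx/rsi before executing the (patched) vmcall instruction, e.g.
 *
 *	asm volatile("vmcall"
 *		     : "=a" (ret)
 *		     : "a" (nr), "b" (a0), "c" (a1), "d" (a2), "S" (a3));
 *
 * Since no hypercall numbers are handled yet, every call returns -KVM_ENOSYS.
 */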
1601
1602 int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
1603 {
1604         char instruction[3];
1605         int ret = 0;
1606
1607         mutex_lock(&vcpu->kvm->lock);
1608
1609         /*
1610          * Blow out the MMU so that no other VCPU has an active mapping,
1611          * ensuring that the updated hypercall appears atomically across
1612          * all VCPUs.
1613          */
1614         kvm_mmu_zap_all(vcpu->kvm);
1615
1616         kvm_x86_ops->cache_regs(vcpu);
1617         kvm_x86_ops->patch_hypercall(vcpu, instruction);
1618         if (emulator_write_emulated(vcpu->rip, instruction, 3, vcpu)
1619             != X86EMUL_CONTINUE)
1620                 ret = -EFAULT;
1621
1622         mutex_unlock(&vcpu->kvm->lock);
1623
1624         return ret;
1625 }
1626
1627 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
1628 {
1629         return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
1630 }
1631
1632 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1633 {
1634         struct descriptor_table dt = { limit, base };
1635
1636         kvm_x86_ops->set_gdt(vcpu, &dt);
1637 }
1638
1639 void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
1640 {
1641         struct descriptor_table dt = { limit, base };
1642
1643         kvm_x86_ops->set_idt(vcpu, &dt);
1644 }
1645
1646 void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
1647                    unsigned long *rflags)
1648 {
1649         lmsw(vcpu, msw);
1650         *rflags = kvm_x86_ops->get_rflags(vcpu);
1651 }
1652
1653 unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
1654 {
1655         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
1656         switch (cr) {
1657         case 0:
1658                 return vcpu->cr0;
1659         case 2:
1660                 return vcpu->cr2;
1661         case 3:
1662                 return vcpu->cr3;
1663         case 4:
1664                 return vcpu->cr4;
1665         default:
1666                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1667                 return 0;
1668         }
1669 }
1670
1671 void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
1672                      unsigned long *rflags)
1673 {
1674         switch (cr) {
1675         case 0:
1676                 set_cr0(vcpu, mk_cr_64(vcpu->cr0, val));
1677                 *rflags = kvm_x86_ops->get_rflags(vcpu);
1678                 break;
1679         case 2:
1680                 vcpu->cr2 = val;
1681                 break;
1682         case 3:
1683                 set_cr3(vcpu, val);
1684                 break;
1685         case 4:
1686                 set_cr4(vcpu, mk_cr_64(vcpu->cr4, val));
1687                 break;
1688         default:
1689                 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __FUNCTION__, cr);
1690         }
1691 }
1692
1693 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1694 {
1695         u64 data;
1696
1697         switch (msr) {
1698         case 0xc0010010: /* SYSCFG */
1699         case 0xc0010015: /* HWCR */
1700         case MSR_IA32_PLATFORM_ID:
1701         case MSR_IA32_P5_MC_ADDR:
1702         case MSR_IA32_P5_MC_TYPE:
1703         case MSR_IA32_MC0_CTL:
1704         case MSR_IA32_MCG_STATUS:
1705         case MSR_IA32_MCG_CAP:
1706         case MSR_IA32_MC0_MISC:
1707         case MSR_IA32_MC0_MISC+4:
1708         case MSR_IA32_MC0_MISC+8:
1709         case MSR_IA32_MC0_MISC+12:
1710         case MSR_IA32_MC0_MISC+16:
1711         case MSR_IA32_UCODE_REV:
1712         case MSR_IA32_PERF_STATUS:
1713         case MSR_IA32_EBL_CR_POWERON:
1714                 /* MTRR registers */
1715         case 0xfe:
1716         case 0x200 ... 0x2ff:
1717                 data = 0;
1718                 break;
1719         case 0xcd: /* fsb frequency */
1720                 data = 3;
1721                 break;
1722         case MSR_IA32_APICBASE:
1723                 data = kvm_get_apic_base(vcpu);
1724                 break;
1725         case MSR_IA32_MISC_ENABLE:
1726                 data = vcpu->ia32_misc_enable_msr;
1727                 break;
1728 #ifdef CONFIG_X86_64
1729         case MSR_EFER:
1730                 data = vcpu->shadow_efer;
1731                 break;
1732 #endif
1733         default:
1734                 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
1735                 return 1;
1736         }
1737         *pdata = data;
1738         return 0;
1739 }
1740 EXPORT_SYMBOL_GPL(kvm_get_msr_common);
1741
1742 /*
1743  * Reads an msr value (of 'msr_index') into 'pdata'.
1744  * Returns 0 on success, non-0 otherwise.
1745  * Assumes vcpu_load() was already called.
1746  */
1747 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1748 {
1749         return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
1750 }
1751
1752 #ifdef CONFIG_X86_64
1753
1754 static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
1755 {
1756         if (efer & EFER_RESERVED_BITS) {
1757                 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
1758                        efer);
1759                 inject_gp(vcpu);
1760                 return;
1761         }
1762
1763         if (is_paging(vcpu)
1764             && (vcpu->shadow_efer & EFER_LME) != (efer & EFER_LME)) {
1765                 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
1766                 inject_gp(vcpu);
1767                 return;
1768         }
1769
1770         kvm_x86_ops->set_efer(vcpu, efer);
1771
1772         efer &= ~EFER_LMA;
1773         efer |= vcpu->shadow_efer & EFER_LMA;
1774
1775         vcpu->shadow_efer = efer;
1776 }
1777
1778 #endif
1779
1780 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1781 {
1782         switch (msr) {
1783 #ifdef CONFIG_X86_64
1784         case MSR_EFER:
1785                 set_efer(vcpu, data);
1786                 break;
1787 #endif
1788         case MSR_IA32_MC0_STATUS:
1789                 pr_unimpl(vcpu, "%s: MSR_IA32_MC0_STATUS 0x%llx, nop\n",
1790                        __FUNCTION__, data);
1791                 break;
1792         case MSR_IA32_MCG_STATUS:
1793                 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_STATUS 0x%llx, nop\n",
1794                         __FUNCTION__, data);
1795                 break;
1796         case MSR_IA32_UCODE_REV:
1797         case MSR_IA32_UCODE_WRITE:
1798         case 0x200 ... 0x2ff: /* MTRRs */
1799                 break;
1800         case MSR_IA32_APICBASE:
1801                 kvm_set_apic_base(vcpu, data);
1802                 break;
1803         case MSR_IA32_MISC_ENABLE:
1804                 vcpu->ia32_misc_enable_msr = data;
1805                 break;
1806         default:
1807                 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x\n", msr);
1808                 return 1;
1809         }
1810         return 0;
1811 }
1812 EXPORT_SYMBOL_GPL(kvm_set_msr_common);
1813
1814 /*
1815  * Writes the msr value into the appropriate "register".
1816  * Returns 0 on success, non-0 otherwise.
1817  * Assumes vcpu_load() was already called.
1818  */
1819 int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1820 {
1821         return kvm_x86_ops->set_msr(vcpu, msr_index, data);
1822 }
1823
1824 void kvm_resched(struct kvm_vcpu *vcpu)
1825 {
1826         if (!need_resched())
1827                 return;
1828         cond_resched();
1829 }
1830 EXPORT_SYMBOL_GPL(kvm_resched);
1831
1832 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
1833 {
1834         int i;
1835         u32 function;
1836         struct kvm_cpuid_entry *e, *best;
1837
1838         kvm_x86_ops->cache_regs(vcpu);
1839         function = vcpu->regs[VCPU_REGS_RAX];
1840         vcpu->regs[VCPU_REGS_RAX] = 0;
1841         vcpu->regs[VCPU_REGS_RBX] = 0;
1842         vcpu->regs[VCPU_REGS_RCX] = 0;
1843         vcpu->regs[VCPU_REGS_RDX] = 0;
1844         best = NULL;
1845         for (i = 0; i < vcpu->cpuid_nent; ++i) {
1846                 e = &vcpu->cpuid_entries[i];
1847                 if (e->function == function) {
1848                         best = e;
1849                         break;
1850                 }
1851                 /*
1852                  * Both basic or both extended?
1853                  */
1854                 if (((e->function ^ function) & 0x80000000) == 0)
1855                         if (!best || e->function > best->function)
1856                                 best = e;
1857         }
1858         if (best) {
1859                 vcpu->regs[VCPU_REGS_RAX] = best->eax;
1860                 vcpu->regs[VCPU_REGS_RBX] = best->ebx;
1861                 vcpu->regs[VCPU_REGS_RCX] = best->ecx;
1862                 vcpu->regs[VCPU_REGS_RDX] = best->edx;
1863         }
1864         kvm_x86_ops->decache_regs(vcpu);
1865         kvm_x86_ops->skip_emulated_instruction(vcpu);
1866 }
1867 EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
1868
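     /*
      * Copy PIO data between the per-vcpu bounce page (pio_data) and the
      * pinned guest pages: guest memory is the destination for an 'in'
      * operation and the source for an 'out'.  The guest pages are
      * unpinned before returning.
      */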
1869 static int pio_copy_data(struct kvm_vcpu *vcpu)
1870 {
1871         void *p = vcpu->pio_data;
1872         void *q;
1873         unsigned bytes;
1874         int nr_pages = vcpu->pio.guest_pages[1] ? 2 : 1;
1875
1876         q = vmap(vcpu->pio.guest_pages, nr_pages, VM_READ|VM_WRITE,
1877                  PAGE_KERNEL);
1878         if (!q) {
1879                 free_pio_guest_pages(vcpu);
1880                 return -ENOMEM;
1881         }
1882         q += vcpu->pio.guest_page_offset;
1883         bytes = vcpu->pio.size * vcpu->pio.cur_count;
1884         if (vcpu->pio.in)
1885                 memcpy(q, p, bytes);
1886         else
1887                 memcpy(p, q, bytes);
1888         q -= vcpu->pio.guest_page_offset;
1889         vunmap(q);
1890         free_pio_guest_pages(vcpu);
1891         return 0;
1892 }
1893
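     /*
      * Finish a pending PIO operation: a non-string 'in' copies the data
      * into RAX; string I/O copies the bounce buffer, advances RSI/RDI by
      * the number of bytes transferred and, for REP, decrements RCX by
      * the element count.
      */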
1894 static int complete_pio(struct kvm_vcpu *vcpu)
1895 {
1896         struct kvm_pio_request *io = &vcpu->pio;
1897         long delta;
1898         int r;
1899
1900         kvm_x86_ops->cache_regs(vcpu);
1901
1902         if (!io->string) {
1903                 if (io->in)
1904                         memcpy(&vcpu->regs[VCPU_REGS_RAX], vcpu->pio_data,
1905                                io->size);
1906         } else {
1907                 if (io->in) {
1908                         r = pio_copy_data(vcpu);
1909                         if (r) {
1910                                 kvm_x86_ops->cache_regs(vcpu);
1911                                 return r;
1912                         }
1913                 }
1914
1915                 delta = 1;
1916                 if (io->rep) {
1917                         delta *= io->cur_count;
1918                         /*
1919                          * The size of the register should really depend on
1920                          * the current address size.
1921                          */
1922                         vcpu->regs[VCPU_REGS_RCX] -= delta;
1923                 }
1924                 if (io->down)
1925                         delta = -delta;
1926                 delta *= io->size;
1927                 if (io->in)
1928                         vcpu->regs[VCPU_REGS_RDI] += delta;
1929                 else
1930                         vcpu->regs[VCPU_REGS_RSI] += delta;
1931         }
1932
1933         kvm_x86_ops->decache_regs(vcpu);
1934
1935         io->count -= io->cur_count;
1936         io->cur_count = 0;
1937
1938         return 0;
1939 }
1940
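     /*
      * Hand a single PIO transaction to an in-kernel device, under the
      * VM lock.
      */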
1941 static void kernel_pio(struct kvm_io_device *pio_dev,
1942                        struct kvm_vcpu *vcpu,
1943                        void *pd)
1944 {
1945         /* TODO: String I/O for in kernel device */
1946
1947         mutex_lock(&vcpu->kvm->lock);
1948         if (vcpu->pio.in)
1949                 kvm_iodevice_read(pio_dev, vcpu->pio.port,
1950                                   vcpu->pio.size,
1951                                   pd);
1952         else
1953                 kvm_iodevice_write(pio_dev, vcpu->pio.port,
1954                                    vcpu->pio.size,
1955                                    pd);
1956         mutex_unlock(&vcpu->kvm->lock);
1957 }
1958
1959 static void pio_string_write(struct kvm_io_device *pio_dev,
1960                              struct kvm_vcpu *vcpu)
1961 {
1962         struct kvm_pio_request *io = &vcpu->pio;
1963         void *pd = vcpu->pio_data;
1964         int i;
1965
1966         mutex_lock(&vcpu->kvm->lock);
1967         for (i = 0; i < io->cur_count; i++) {
1968                 kvm_iodevice_write(pio_dev, io->port,
1969                                    io->size,
1970                                    pd);
1971                 pd += io->size;
1972         }
1973         mutex_unlock(&vcpu->kvm->lock);
1974 }
1975
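     /*
      * Emulate a single (non-string) in/out.  The kvm_run I/O fields are
      * always filled in; if an in-kernel device claims the port the
      * transaction completes here and 1 is returned, otherwise 0 is
      * returned and userspace performs the I/O.
      */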
1976 int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
1977                   int size, unsigned port)
1978 {
1979         struct kvm_io_device *pio_dev;
1980
1981         vcpu->run->exit_reason = KVM_EXIT_IO;
1982         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
1983         vcpu->run->io.size = vcpu->pio.size = size;
1984         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
1985         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = 1;
1986         vcpu->run->io.port = vcpu->pio.port = port;
1987         vcpu->pio.in = in;
1988         vcpu->pio.string = 0;
1989         vcpu->pio.down = 0;
1990         vcpu->pio.guest_page_offset = 0;
1991         vcpu->pio.rep = 0;
1992
1993         kvm_x86_ops->cache_regs(vcpu);
1994         memcpy(vcpu->pio_data, &vcpu->regs[VCPU_REGS_RAX], 4);
1995         kvm_x86_ops->decache_regs(vcpu);
1996
1997         kvm_x86_ops->skip_emulated_instruction(vcpu);
1998
1999         pio_dev = vcpu_find_pio_dev(vcpu, port);
2000         if (pio_dev) {
2001                 kernel_pio(pio_dev, vcpu, vcpu->pio_data);
2002                 complete_pio(vcpu);
2003                 return 1;
2004         }
2005         return 0;
2006 }
2007 EXPORT_SYMBOL_GPL(kvm_emulate_pio);
2008
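     /*
      * Emulate string PIO (ins/outs).  At most a page worth of data is
      * transferred per call; a transfer straddling a page boundary pins
      * two guest pages and moves a single element.  String writes to an
      * in-kernel device complete here; in-kernel string reads are not
      * supported yet and are left to userspace.
      */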
2009 int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2010                   int size, unsigned long count, int down,
2011                   gva_t address, int rep, unsigned port)
2012 {
2013         unsigned now, in_page;
2014         int i, ret = 0;
2015         int nr_pages = 1;
2016         struct page *page;
2017         struct kvm_io_device *pio_dev;
2018
2019         vcpu->run->exit_reason = KVM_EXIT_IO;
2020         vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2021         vcpu->run->io.size = vcpu->pio.size = size;
2022         vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
2023         vcpu->run->io.count = vcpu->pio.count = vcpu->pio.cur_count = count;
2024         vcpu->run->io.port = vcpu->pio.port = port;
2025         vcpu->pio.in = in;
2026         vcpu->pio.string = 1;
2027         vcpu->pio.down = down;
2028         vcpu->pio.guest_page_offset = offset_in_page(address);
2029         vcpu->pio.rep = rep;
2030
2031         if (!count) {
2032                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2033                 return 1;
2034         }
2035
2036         if (!down)
2037                 in_page = PAGE_SIZE - offset_in_page(address);
2038         else
2039                 in_page = offset_in_page(address) + size;
2040         now = min(count, (unsigned long)in_page / size);
2041         if (!now) {
2042                 /*
2043                  * String I/O straddles page boundary.  Pin two guest pages
2044                  * so that we satisfy atomicity constraints.  Do just one
2045                  * transaction to avoid complexity.
2046                  */
2047                 nr_pages = 2;
2048                 now = 1;
2049         }
2050         if (down) {
2051                 /*
2052                  * String I/O in reverse.  Yuck.  Kill the guest, fix later.
2053                  */
2054                 pr_unimpl(vcpu, "guest string pio down\n");
2055                 inject_gp(vcpu);
2056                 return 1;
2057         }
2058         vcpu->run->io.count = now;
2059         vcpu->pio.cur_count = now;
2060
2061         if (vcpu->pio.cur_count == vcpu->pio.count)
2062                 kvm_x86_ops->skip_emulated_instruction(vcpu);
2063
2064         for (i = 0; i < nr_pages; ++i) {
2065                 mutex_lock(&vcpu->kvm->lock);
2066                 page = gva_to_page(vcpu, address + i * PAGE_SIZE);
2067                 if (page)
2068                         get_page(page);
2069                 vcpu->pio.guest_pages[i] = page;
2070                 mutex_unlock(&vcpu->kvm->lock);
2071                 if (!page) {
2072                         inject_gp(vcpu);
2073                         free_pio_guest_pages(vcpu);
2074                         return 1;
2075                 }
2076         }
2077
2078         pio_dev = vcpu_find_pio_dev(vcpu, port);
2079         if (!vcpu->pio.in) {
2080                 /* string PIO write */
2081                 ret = pio_copy_data(vcpu);
2082                 if (ret >= 0 && pio_dev) {
2083                         pio_string_write(pio_dev, vcpu);
2084                         complete_pio(vcpu);
2085                         if (vcpu->pio.count == 0)
2086                                 ret = 1;
2087                 }
2088         } else if (pio_dev)
2089                 pr_unimpl(vcpu, "no string pio read support yet, "
2090                        "port %x size %d count %ld\n",
2091                         port, size, count);
2092
2093         return ret;
2094 }
2095 EXPORT_SYMBOL_GPL(kvm_emulate_pio_string);
2096
2097 /*
2098  * Check whether userspace requested an interrupt window and whether
2099  * the interrupt window is open.
2100  *
2101  * No need to exit to userspace if we already have an interrupt queued.
2102  */
2103 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
2104                                           struct kvm_run *kvm_run)
2105 {
2106         return (!vcpu->irq_summary &&
2107                 kvm_run->request_interrupt_window &&
2108                 vcpu->interrupt_window_open &&
2109                 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
2110 }
2111
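     /*
      * Copy volatile state (IF flag, CR8, APIC base, readiness for
      * interrupt injection) into the kvm_run area before returning to
      * userspace.
      */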
2112 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
2113                               struct kvm_run *kvm_run)
2114 {
2115         kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
2116         kvm_run->cr8 = get_cr8(vcpu);
2117         kvm_run->apic_base = kvm_get_apic_base(vcpu);
2118         if (irqchip_in_kernel(vcpu->kvm))
2119                 kvm_run->ready_for_interrupt_injection = 1;
2120         else
2121                 kvm_run->ready_for_interrupt_injection =
2122                                         (vcpu->interrupt_window_open &&
2123                                          vcpu->irq_summary == 0);
2124 }
2125
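     /*
      * The main guest entry loop: reload the MMU, inject pending
      * interrupts, enter the guest with interrupts disabled and handle
      * the resulting exit.  Exits the kernel can handle loop straight
      * back into the guest (yielding the cpu first if a reschedule is
      * pending); signals and userspace-requested exits return to the
      * caller.
      */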
2126 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2127 {
2128         int r;
2129
2130         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_SIPI_RECEIVED)) {
2131                 pr_debug("vcpu %d received sipi with vector # %x\n",
2132                        vcpu->vcpu_id, vcpu->sipi_vector);
2133                 kvm_lapic_reset(vcpu);
2134                 kvm_x86_ops->vcpu_reset(vcpu);
2135                 vcpu->mp_state = VCPU_MP_STATE_RUNNABLE;
2136         }
2137
2138 preempted:
2139         if (vcpu->guest_debug.enabled)
2140                 kvm_x86_ops->guest_debug_pre(vcpu);
2141
2142 again:
2143         r = kvm_mmu_reload(vcpu);
2144         if (unlikely(r))
2145                 goto out;
2146
2147         preempt_disable();
2148
2149         kvm_x86_ops->prepare_guest_switch(vcpu);
2150         kvm_load_guest_fpu(vcpu);
2151
2152         local_irq_disable();
2153
2154         if (signal_pending(current)) {
2155                 local_irq_enable();
2156                 preempt_enable();
2157                 r = -EINTR;
2158                 kvm_run->exit_reason = KVM_EXIT_INTR;
2159                 ++vcpu->stat.signal_exits;
2160                 goto out;
2161         }
2162
2163         if (irqchip_in_kernel(vcpu->kvm))
2164                 kvm_x86_ops->inject_pending_irq(vcpu);
2165         else if (!vcpu->mmio_read_completed)
2166                 kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
2167
2168         vcpu->guest_mode = 1;
2169         kvm_guest_enter();
2170
2171         if (vcpu->requests)
2172                 if (test_and_clear_bit(KVM_TLB_FLUSH, &vcpu->requests))
2173                         kvm_x86_ops->tlb_flush(vcpu);
2174
2175         kvm_x86_ops->run(vcpu, kvm_run);
2176
2177         vcpu->guest_mode = 0;
2178         local_irq_enable();
2179
2180         ++vcpu->stat.exits;
2181
2182         /*
2183          * We must have an instruction between local_irq_enable() and
2184          * kvm_guest_exit(), so the timer interrupt isn't delayed by
2185          * the interrupt shadow.  The stat.exits increment will do nicely.
2186          * But we need to prevent reordering, hence this barrier():
2187          */
2188         barrier();
2189
2190         kvm_guest_exit();
2191
2192         preempt_enable();
2193
2194         /*
2195          * Profile KVM exit RIPs:
2196          */
2197         if (unlikely(prof_on == KVM_PROFILING)) {
2198                 kvm_x86_ops->cache_regs(vcpu);
2199                 profile_hit(KVM_PROFILING, (void *)vcpu->rip);
2200         }
2201
2202         r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2203
2204         if (r > 0) {
2205                 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
2206                         r = -EINTR;
2207                         kvm_run->exit_reason = KVM_EXIT_INTR;
2208                         ++vcpu->stat.request_irq_exits;
2209                         goto out;
2210                 }
2211                 if (!need_resched()) {
2212                         ++vcpu->stat.light_exits;
2213                         goto again;
2214                 }
2215         }
2216
2217 out:
2218         if (r > 0) {
2219                 kvm_resched(vcpu);
2220                 goto preempted;
2221         }
2222
2223         post_kvm_run_save(vcpu, kvm_run);
2224
2225         return r;
2226 }
2227
2228
2229 static int kvm_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2230 {
2231         int r;
2232         sigset_t sigsaved;
2233
2234         vcpu_load(vcpu);
2235
2236         if (unlikely(vcpu->mp_state == VCPU_MP_STATE_UNINITIALIZED)) {
2237                 kvm_vcpu_block(vcpu);
2238                 vcpu_put(vcpu);
2239                 return -EAGAIN;
2240         }
2241
2242         if (vcpu->sigset_active)
2243                 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2244
2245         /* re-sync apic's tpr */
2246         if (!irqchip_in_kernel(vcpu->kvm))
2247                 set_cr8(vcpu, kvm_run->cr8);
2248
2249         if (vcpu->pio.cur_count) {
2250                 r = complete_pio(vcpu);
2251                 if (r)
2252                         goto out;
2253         }
2254
2255         if (vcpu->mmio_needed) {
2256                 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
2257                 vcpu->mmio_read_completed = 1;
2258                 vcpu->mmio_needed = 0;
2259                 r = emulate_instruction(vcpu, kvm_run,
2260                                         vcpu->mmio_fault_cr2, 0, 1);
2261                 if (r == EMULATE_DO_MMIO) {
2262                         /*
2263                          * Read-modify-write.  Back to userspace.
2264                          */
2265                         r = 0;
2266                         goto out;
2267                 }
2268         }
2269
2270         if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) {
2271                 kvm_x86_ops->cache_regs(vcpu);
2272                 vcpu->regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret;
2273                 kvm_x86_ops->decache_regs(vcpu);
2274         }
2275
2276         r = __vcpu_run(vcpu, kvm_run);
2277
2278 out:
2279         if (vcpu->sigset_active)
2280                 sigprocmask(SIG_SETMASK, &sigsaved, NULL);
2281
2282         vcpu_put(vcpu);
2283         return r;
2284 }
2285
2286 static int kvm_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu,
2287                                    struct kvm_regs *regs)
2288 {
2289         vcpu_load(vcpu);
2290
2291         kvm_x86_ops->cache_regs(vcpu);
2292
2293         regs->rax = vcpu->regs[VCPU_REGS_RAX];
2294         regs->rbx = vcpu->regs[VCPU_REGS_RBX];
2295         regs->rcx = vcpu->regs[VCPU_REGS_RCX];
2296         regs->rdx = vcpu->regs[VCPU_REGS_RDX];
2297         regs->rsi = vcpu->regs[VCPU_REGS_RSI];
2298         regs->rdi = vcpu->regs[VCPU_REGS_RDI];
2299         regs->rsp = vcpu->regs[VCPU_REGS_RSP];
2300         regs->rbp = vcpu->regs[VCPU_REGS_RBP];
2301 #ifdef CONFIG_X86_64
2302         regs->r8 = vcpu->regs[VCPU_REGS_R8];
2303         regs->r9 = vcpu->regs[VCPU_REGS_R9];
2304         regs->r10 = vcpu->regs[VCPU_REGS_R10];
2305         regs->r11 = vcpu->regs[VCPU_REGS_R11];
2306         regs->r12 = vcpu->regs[VCPU_REGS_R12];
2307         regs->r13 = vcpu->regs[VCPU_REGS_R13];
2308         regs->r14 = vcpu->regs[VCPU_REGS_R14];
2309         regs->r15 = vcpu->regs[VCPU_REGS_R15];
2310 #endif
2311
2312         regs->rip = vcpu->rip;
2313         regs->rflags = kvm_x86_ops->get_rflags(vcpu);
2314
2315         /*
2316          * Don't leak debug flags in case they were set for guest debugging
2317          */
2318         if (vcpu->guest_debug.enabled && vcpu->guest_debug.singlestep)
2319                 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
2320
2321         vcpu_put(vcpu);
2322
2323         return 0;
2324 }
2325
2326 static int kvm_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu,
2327                                    struct kvm_regs *regs)
2328 {
2329         vcpu_load(vcpu);
2330
2331         vcpu->regs[VCPU_REGS_RAX] = regs->rax;
2332         vcpu->regs[VCPU_REGS_RBX] = regs->rbx;
2333         vcpu->regs[VCPU_REGS_RCX] = regs->rcx;
2334         vcpu->regs[VCPU_REGS_RDX] = regs->rdx;
2335         vcpu->regs[VCPU_REGS_RSI] = regs->rsi;
2336         vcpu->regs[VCPU_REGS_RDI] = regs->rdi;
2337         vcpu->regs[VCPU_REGS_RSP] = regs->rsp;
2338         vcpu->regs[VCPU_REGS_RBP] = regs->rbp;
2339 #ifdef CONFIG_X86_64
2340         vcpu->regs[VCPU_REGS_R8] = regs->r8;
2341         vcpu->regs[VCPU_REGS_R9] = regs->r9;
2342         vcpu->regs[VCPU_REGS_R10] = regs->r10;
2343         vcpu->regs[VCPU_REGS_R11] = regs->r11;
2344         vcpu->regs[VCPU_REGS_R12] = regs->r12;
2345         vcpu->regs[VCPU_REGS_R13] = regs->r13;
2346         vcpu->regs[VCPU_REGS_R14] = regs->r14;
2347         vcpu->regs[VCPU_REGS_R15] = regs->r15;
2348 #endif
2349
2350         vcpu->rip = regs->rip;
2351         kvm_x86_ops->set_rflags(vcpu, regs->rflags);
2352
2353         kvm_x86_ops->decache_regs(vcpu);
2354
2355         vcpu_put(vcpu);
2356
2357         return 0;
2358 }
2359
2360 static void get_segment(struct kvm_vcpu *vcpu,
2361                         struct kvm_segment *var, int seg)
2362 {
2363         return kvm_x86_ops->get_segment(vcpu, var, seg);
2364 }
2365
2366 static int kvm_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
2367                                     struct kvm_sregs *sregs)
2368 {
2369         struct descriptor_table dt;
2370         int pending_vec;
2371
2372         vcpu_load(vcpu);
2373
2374         get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2375         get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2376         get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2377         get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2378         get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2379         get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2380
2381         get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2382         get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2383
2384         kvm_x86_ops->get_idt(vcpu, &dt);
2385         sregs->idt.limit = dt.limit;
2386         sregs->idt.base = dt.base;
2387         kvm_x86_ops->get_gdt(vcpu, &dt);
2388         sregs->gdt.limit = dt.limit;
2389         sregs->gdt.base = dt.base;
2390
2391         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2392         sregs->cr0 = vcpu->cr0;
2393         sregs->cr2 = vcpu->cr2;
2394         sregs->cr3 = vcpu->cr3;
2395         sregs->cr4 = vcpu->cr4;
2396         sregs->cr8 = get_cr8(vcpu);
2397         sregs->efer = vcpu->shadow_efer;
2398         sregs->apic_base = kvm_get_apic_base(vcpu);
2399
2400         if (irqchip_in_kernel(vcpu->kvm)) {
2401                 memset(sregs->interrupt_bitmap, 0,
2402                        sizeof sregs->interrupt_bitmap);
2403                 pending_vec = kvm_x86_ops->get_irq(vcpu);
2404                 if (pending_vec >= 0)
2405                         set_bit(pending_vec,
2406                                 (unsigned long *)sregs->interrupt_bitmap);
2407         } else
2408                 memcpy(sregs->interrupt_bitmap, vcpu->irq_pending,
2409                        sizeof sregs->interrupt_bitmap);
2410
2411         vcpu_put(vcpu);
2412
2413         return 0;
2414 }
2415
2416 static void set_segment(struct kvm_vcpu *vcpu,
2417                         struct kvm_segment *var, int seg)
2418 {
2419         return kvm_x86_ops->set_segment(vcpu, var, seg);
2420 }
2421
2422 static int kvm_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
2423                                     struct kvm_sregs *sregs)
2424 {
2425         int mmu_reset_needed = 0;
2426         int i, pending_vec, max_bits;
2427         struct descriptor_table dt;
2428
2429         vcpu_load(vcpu);
2430
2431         dt.limit = sregs->idt.limit;
2432         dt.base = sregs->idt.base;
2433         kvm_x86_ops->set_idt(vcpu, &dt);
2434         dt.limit = sregs->gdt.limit;
2435         dt.base = sregs->gdt.base;
2436         kvm_x86_ops->set_gdt(vcpu, &dt);
2437
2438         vcpu->cr2 = sregs->cr2;
2439         mmu_reset_needed |= vcpu->cr3 != sregs->cr3;
2440         vcpu->cr3 = sregs->cr3;
2441
2442         set_cr8(vcpu, sregs->cr8);
2443
2444         mmu_reset_needed |= vcpu->shadow_efer != sregs->efer;
2445 #ifdef CONFIG_X86_64
2446         kvm_x86_ops->set_efer(vcpu, sregs->efer);
2447 #endif
2448         kvm_set_apic_base(vcpu, sregs->apic_base);
2449
2450         kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2451
2452         mmu_reset_needed |= vcpu->cr0 != sregs->cr0;
2453         vcpu->cr0 = sregs->cr0;
2454         kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
2455
2456         mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
2457         kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
2458         if (!is_long_mode(vcpu) && is_pae(vcpu))
2459                 load_pdptrs(vcpu, vcpu->cr3);
2460
2461         if (mmu_reset_needed)
2462                 kvm_mmu_reset_context(vcpu);
2463
2464         if (!irqchip_in_kernel(vcpu->kvm)) {
2465                 memcpy(vcpu->irq_pending, sregs->interrupt_bitmap,
2466                        sizeof vcpu->irq_pending);
2467                 vcpu->irq_summary = 0;
2468                 for (i = 0; i < ARRAY_SIZE(vcpu->irq_pending); ++i)
2469                         if (vcpu->irq_pending[i])
2470                                 __set_bit(i, &vcpu->irq_summary);
2471         } else {
2472                 max_bits = (sizeof sregs->interrupt_bitmap) << 3;
2473                 pending_vec = find_first_bit(
2474                         (const unsigned long *)sregs->interrupt_bitmap,
2475                         max_bits);
2476                 /* Only pending external irq is handled here */
2477                 if (pending_vec < max_bits) {
2478                         kvm_x86_ops->set_irq(vcpu, pending_vec);
2479                         pr_debug("Set back pending irq %d\n",
2480                                  pending_vec);
2481                 }
2482         }
2483
2484         set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
2485         set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
2486         set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
2487         set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
2488         set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
2489         set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
2490
2491         set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
2492         set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
2493
2494         vcpu_put(vcpu);
2495
2496         return 0;
2497 }
2498
2499 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2500 {
2501         struct kvm_segment cs;
2502
2503         get_segment(vcpu, &cs, VCPU_SREG_CS);
2504         *db = cs.db;
2505         *l = cs.l;
2506 }
2507 EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
2508
2509 /*
2510  * Translate a guest virtual address to a guest physical address.
2511  */
2512 static int kvm_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
2513                                     struct kvm_translation *tr)
2514 {
2515         unsigned long vaddr = tr->linear_address;
2516         gpa_t gpa;
2517
2518         vcpu_load(vcpu);
2519         mutex_lock(&vcpu->kvm->lock);
2520         gpa = vcpu->mmu.gva_to_gpa(vcpu, vaddr);
2521         tr->physical_address = gpa;
2522         tr->valid = gpa != UNMAPPED_GVA;
2523         tr->writeable = 1;
2524         tr->usermode = 0;
2525         mutex_unlock(&vcpu->kvm->lock);
2526         vcpu_put(vcpu);
2527
2528         return 0;
2529 }
2530
2531 static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2532                                     struct kvm_interrupt *irq)
2533 {
2534         if (irq->irq < 0 || irq->irq >= 256)
2535                 return -EINVAL;
2536         if (irqchip_in_kernel(vcpu->kvm))
2537                 return -ENXIO;
2538         vcpu_load(vcpu);
2539
2540         set_bit(irq->irq, vcpu->irq_pending);
2541         set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary);
2542
2543         vcpu_put(vcpu);
2544
2545         return 0;
2546 }
2547
2548 static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
2549                                       struct kvm_debug_guest *dbg)
2550 {
2551         int r;
2552
2553         vcpu_load(vcpu);
2554
2555         r = kvm_x86_ops->set_guest_debug(vcpu, dbg);
2556
2557         vcpu_put(vcpu);
2558
2559         return r;
2560 }
2561
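     /*
      * nopage handler for the vcpu mmap: page 0 backs the kvm_run
      * structure and page KVM_PIO_PAGE_OFFSET backs the PIO data page;
      * any other offset faults.
      */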
2562 static struct page *kvm_vcpu_nopage(struct vm_area_struct *vma,
2563                                     unsigned long address,
2564                                     int *type)
2565 {
2566         struct kvm_vcpu *vcpu = vma->vm_file->private_data;
2567         unsigned long pgoff;
2568         struct page *page;
2569
2570         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2571         if (pgoff == 0)
2572                 page = virt_to_page(vcpu->run);
2573         else if (pgoff == KVM_PIO_PAGE_OFFSET)
2574                 page = virt_to_page(vcpu->pio_data);
2575         else
2576                 return NOPAGE_SIGBUS;
2577         get_page(page);
2578         if (type != NULL)
2579                 *type = VM_FAULT_MINOR;
2580
2581         return page;
2582 }
2583
2584 static struct vm_operations_struct kvm_vcpu_vm_ops = {
2585         .nopage = kvm_vcpu_nopage,
2586 };
2587
2588 static int kvm_vcpu_mmap(struct file *file, struct vm_area_struct *vma)
2589 {
2590         vma->vm_ops = &kvm_vcpu_vm_ops;
2591         return 0;
2592 }
2593
2594 static int kvm_vcpu_release(struct inode *inode, struct file *filp)
2595 {
2596         struct kvm_vcpu *vcpu = filp->private_data;
2597
2598         fput(vcpu->kvm->filp);
2599         return 0;
2600 }
2601
2602 static struct file_operations kvm_vcpu_fops = {
2603         .release        = kvm_vcpu_release,
2604         .unlocked_ioctl = kvm_vcpu_ioctl,
2605         .compat_ioctl   = kvm_vcpu_ioctl,
2606         .mmap           = kvm_vcpu_mmap,
2607 };
2608
2609 /*
2610  * Allocates an inode for the vcpu.
2611  */
2612 static int create_vcpu_fd(struct kvm_vcpu *vcpu)
2613 {
2614         int fd, r;
2615         struct inode *inode;
2616         struct file *file;
2617
2618         r = anon_inode_getfd(&fd, &inode, &file,
2619                              "kvm-vcpu", &kvm_vcpu_fops, vcpu);
2620         if (r)
2621                 return r;
2622         atomic_inc(&vcpu->kvm->filp->f_count);
2623         return fd;
2624 }
2625
2626 /*
2627  * Creates some virtual cpus.  Good luck creating more than one.
2628  */
2629 static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, int n)
2630 {
2631         int r;
2632         struct kvm_vcpu *vcpu;
2633
2634         if (!valid_vcpu(n))
2635                 return -EINVAL;
2636
2637         vcpu = kvm_x86_ops->vcpu_create(kvm, n);
2638         if (IS_ERR(vcpu))
2639                 return PTR_ERR(vcpu);
2640
2641         preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
2642
2643         /* We do fxsave: this must be aligned. */
2644         BUG_ON((unsigned long)&vcpu->host_fx_image & 0xF);
2645
2646         vcpu_load(vcpu);
2647         r = kvm_mmu_setup(vcpu);
2648         vcpu_put(vcpu);
2649         if (r < 0)
2650                 goto free_vcpu;
2651
2652         mutex_lock(&kvm->lock);
2653         if (kvm->vcpus[n]) {
2654                 r = -EEXIST;
2655                 mutex_unlock(&kvm->lock);
2656                 goto mmu_unload;
2657         }
2658         kvm->vcpus[n] = vcpu;
2659         mutex_unlock(&kvm->lock);
2660
2661         /* Now it's all set up, let userspace reach it */
2662         r = create_vcpu_fd(vcpu);
2663         if (r < 0)
2664                 goto unlink;
2665         return r;
2666
2667 unlink:
2668         mutex_lock(&kvm->lock);
2669         kvm->vcpus[n] = NULL;
2670         mutex_unlock(&kvm->lock);
2671
2672 mmu_unload:
2673         vcpu_load(vcpu);
2674         kvm_mmu_unload(vcpu);
2675         vcpu_put(vcpu);
2676
2677 free_vcpu:
2678         kvm_x86_ops->vcpu_free(vcpu);
2679         return r;
2680 }
2681
2682 static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
2683 {
2684         if (sigset) {
2685                 sigdelsetmask(sigset, sigmask(SIGKILL)|sigmask(SIGSTOP));
2686                 vcpu->sigset_active = 1;
2687                 vcpu->sigset = *sigset;
2688         } else
2689                 vcpu->sigset_active = 0;
2690         return 0;
2691 }
2692
2693 /*
2694  * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
2695  * we have asm/x86/processor.h
2696  */
2697 struct fxsave {
2698         u16     cwd;
2699         u16     swd;
2700         u16     twd;
2701         u16     fop;
2702         u64     rip;
2703         u64     rdp;
2704         u32     mxcsr;
2705         u32     mxcsr_mask;
2706         u32     st_space[32];   /* 8*16 bytes for each FP-reg = 128 bytes */
2707 #ifdef CONFIG_X86_64
2708         u32     xmm_space[64];  /* 16*16 bytes for each XMM-reg = 256 bytes */
2709 #else
2710         u32     xmm_space[32];  /* 8*16 bytes for each XMM-reg = 128 bytes */
2711 #endif
2712 };
2713
2714 static int kvm_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2715 {
2716         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2717
2718         vcpu_load(vcpu);
2719
2720         memcpy(fpu->fpr, fxsave->st_space, 128);
2721         fpu->fcw = fxsave->cwd;
2722         fpu->fsw = fxsave->swd;
2723         fpu->ftwx = fxsave->twd;
2724         fpu->last_opcode = fxsave->fop;
2725         fpu->last_ip = fxsave->rip;
2726         fpu->last_dp = fxsave->rdp;
2727         memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
2728
2729         vcpu_put(vcpu);
2730
2731         return 0;
2732 }
2733
2734 static int kvm_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
2735 {
2736         struct fxsave *fxsave = (struct fxsave *)&vcpu->guest_fx_image;
2737
2738         vcpu_load(vcpu);
2739
2740         memcpy(fxsave->st_space, fpu->fpr, 128);
2741         fxsave->cwd = fpu->fcw;
2742         fxsave->swd = fpu->fsw;
2743         fxsave->twd = fpu->ftwx;
2744         fxsave->fop = fpu->last_opcode;
2745         fxsave->rip = fpu->last_ip;
2746         fxsave->rdp = fpu->last_dp;
2747         memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
2748
2749         vcpu_put(vcpu);
2750
2751         return 0;
2752 }
2753
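     /*
      * Dispatcher for the per-vcpu ioctls.  Generic commands are handled
      * here; anything unrecognized is forwarded to the architecture
      * specific kvm_arch_vcpu_ioctl().
      */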
2754 static long kvm_vcpu_ioctl(struct file *filp,
2755                            unsigned int ioctl, unsigned long arg)
2756 {
2757         struct kvm_vcpu *vcpu = filp->private_data;
2758         void __user *argp = (void __user *)arg;
2759         int r;
2760
2761         switch (ioctl) {
2762         case KVM_RUN:
2763                 r = -EINVAL;
2764                 if (arg)
2765                         goto out;
2766                 r = kvm_vcpu_ioctl_run(vcpu, vcpu->run);
2767                 break;
2768         case KVM_GET_REGS: {
2769                 struct kvm_regs kvm_regs;
2770
2771                 memset(&kvm_regs, 0, sizeof kvm_regs);
2772                 r = kvm_vcpu_ioctl_get_regs(vcpu, &kvm_regs);
2773                 if (r)
2774                         goto out;
2775                 r = -EFAULT;
2776                 if (copy_to_user(argp, &kvm_regs, sizeof kvm_regs))
2777                         goto out;
2778                 r = 0;
2779                 break;
2780         }
2781         case KVM_SET_REGS: {
2782                 struct kvm_regs kvm_regs;
2783
2784                 r = -EFAULT;
2785                 if (copy_from_user(&kvm_regs, argp, sizeof kvm_regs))
2786                         goto out;
2787                 r = kvm_vcpu_ioctl_set_regs(vcpu, &kvm_regs);
2788                 if (r)
2789                         goto out;
2790                 r = 0;
2791                 break;
2792         }
2793         case KVM_GET_SREGS: {
2794                 struct kvm_sregs kvm_sregs;
2795
2796                 memset(&kvm_sregs, 0, sizeof kvm_sregs);
2797                 r = kvm_vcpu_ioctl_get_sregs(vcpu, &kvm_sregs);
2798                 if (r)
2799                         goto out;
2800                 r = -EFAULT;
2801                 if (copy_to_user(argp, &kvm_sregs, sizeof kvm_sregs))
2802                         goto out;
2803                 r = 0;
2804                 break;
2805         }
2806         case KVM_SET_SREGS: {
2807                 struct kvm_sregs kvm_sregs;
2808
2809                 r = -EFAULT;
2810                 if (copy_from_user(&kvm_sregs, argp, sizeof kvm_sregs))
2811                         goto out;
2812                 r = kvm_vcpu_ioctl_set_sregs(vcpu, &kvm_sregs);
2813                 if (r)
2814                         goto out;
2815                 r = 0;
2816                 break;
2817         }
2818         case KVM_TRANSLATE: {
2819                 struct kvm_translation tr;
2820
2821                 r = -EFAULT;
2822                 if (copy_from_user(&tr, argp, sizeof tr))
2823                         goto out;
2824                 r = kvm_vcpu_ioctl_translate(vcpu, &tr);
2825                 if (r)
2826                         goto out;
2827                 r = -EFAULT;
2828                 if (copy_to_user(argp, &tr, sizeof tr))
2829                         goto out;
2830                 r = 0;
2831                 break;
2832         }
2833         case KVM_INTERRUPT: {
2834                 struct kvm_interrupt irq;
2835
2836                 r = -EFAULT;
2837                 if (copy_from_user(&irq, argp, sizeof irq))
2838                         goto out;
2839                 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
2840                 if (r)
2841                         goto out;
2842                 r = 0;
2843                 break;
2844         }
2845         case KVM_DEBUG_GUEST: {
2846                 struct kvm_debug_guest dbg;
2847
2848                 r = -EFAULT;
2849                 if (copy_from_user(&dbg, argp, sizeof dbg))
2850                         goto out;
2851                 r = kvm_vcpu_ioctl_debug_guest(vcpu, &dbg);
2852                 if (r)
2853                         goto out;
2854                 r = 0;
2855                 break;
2856         }
2857         case KVM_SET_SIGNAL_MASK: {
2858                 struct kvm_signal_mask __user *sigmask_arg = argp;
2859                 struct kvm_signal_mask kvm_sigmask;
2860                 sigset_t sigset, *p;
2861
2862                 p = NULL;
2863                 if (argp) {
2864                         r = -EFAULT;
2865                         if (copy_from_user(&kvm_sigmask, argp,
2866                                            sizeof kvm_sigmask))
2867                                 goto out;
2868                         r = -EINVAL;
2869                         if (kvm_sigmask.len != sizeof sigset)
2870                                 goto out;
2871                         r = -EFAULT;
2872                         if (copy_from_user(&sigset, sigmask_arg->sigset,
2873                                            sizeof sigset))
2874                                 goto out;
2875                         p = &sigset;
2876                 }
2877                 r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
2878                 break;
2879         }
2880         case KVM_GET_FPU: {
2881                 struct kvm_fpu fpu;
2882
2883                 memset(&fpu, 0, sizeof fpu);
2884                 r = kvm_vcpu_ioctl_get_fpu(vcpu, &fpu);
2885                 if (r)
2886                         goto out;
2887                 r = -EFAULT;
2888                 if (copy_to_user(argp, &fpu, sizeof fpu))
2889                         goto out;
2890                 r = 0;
2891                 break;
2892         }
2893         case KVM_SET_FPU: {
2894                 struct kvm_fpu fpu;
2895
2896                 r = -EFAULT;
2897                 if (copy_from_user(&fpu, argp, sizeof fpu))
2898                         goto out;
2899                 r = kvm_vcpu_ioctl_set_fpu(vcpu, &fpu);
2900                 if (r)
2901                         goto out;
2902                 r = 0;
2903                 break;
2904         }
2905         default:
2906                 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
2907         }
2908 out:
2909         return r;
2910 }
2911
2912 static long kvm_vm_ioctl(struct file *filp,
2913                            unsigned int ioctl, unsigned long arg)
2914 {
2915         struct kvm *kvm = filp->private_data;
2916         void __user *argp = (void __user *)arg;
2917         int r = -EINVAL;
2918
2919         switch (ioctl) {
2920         case KVM_CREATE_VCPU:
2921                 r = kvm_vm_ioctl_create_vcpu(kvm, arg);
2922                 if (r < 0)
2923                         goto out;
2924                 break;
2925         case KVM_SET_MEMORY_REGION: {
2926                 struct kvm_memory_region kvm_mem;
2927                 struct kvm_userspace_memory_region kvm_userspace_mem;
2928
2929                 r = -EFAULT;
2930                 if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
2931                         goto out;
2932                 kvm_userspace_mem.slot = kvm_mem.slot;
2933                 kvm_userspace_mem.flags = kvm_mem.flags;
2934                 kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
2935                 kvm_userspace_mem.memory_size = kvm_mem.memory_size;
2936                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
2937                 if (r)
2938                         goto out;
2939                 break;
2940         }
2941         case KVM_SET_USER_MEMORY_REGION: {
2942                 struct kvm_userspace_memory_region kvm_userspace_mem;
2943
2944                 r = -EFAULT;
2945                 if (copy_from_user(&kvm_userspace_mem, argp,
2946                                                 sizeof kvm_userspace_mem))
2947                         goto out;
2948
2949                 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1);
2950                 if (r)
2951                         goto out;
2952                 break;
2953         }
2954         case KVM_SET_NR_MMU_PAGES:
2955                 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
2956                 if (r)
2957                         goto out;
2958                 break;
2959         case KVM_GET_NR_MMU_PAGES:
2960                 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
2961                 break;
2962         case KVM_GET_DIRTY_LOG: {
2963                 struct kvm_dirty_log log;
2964
2965                 r = -EFAULT;
2966                 if (copy_from_user(&log, argp, sizeof log))
2967                         goto out;
2968                 r = kvm_vm_ioctl_get_dirty_log(kvm, &log);
2969                 if (r)
2970                         goto out;
2971                 break;
2972         }
2973         case KVM_SET_MEMORY_ALIAS: {
2974                 struct kvm_memory_alias alias;
2975
2976                 r = -EFAULT;
2977                 if (copy_from_user(&alias, argp, sizeof alias))
2978                         goto out;
2979                 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias);
2980                 if (r)
2981                         goto out;
2982                 break;
2983         }
2984         case KVM_CREATE_IRQCHIP:
2985                 r = -ENOMEM;
2986                 kvm->vpic = kvm_create_pic(kvm);
2987                 if (kvm->vpic) {
2988                         r = kvm_ioapic_init(kvm);
2989                         if (r) {
2990                                 kfree(kvm->vpic);
2991                                 kvm->vpic = NULL;
2992                                 goto out;
2993                         }
2994                 } else
2995                         goto out;
2996                 break;
2997         case KVM_IRQ_LINE: {
2998                 struct kvm_irq_level irq_event;
2999
3000                 r = -EFAULT;
3001                 if (copy_from_user(&irq_event, argp, sizeof irq_event))
3002                         goto out;
3003                 if (irqchip_in_kernel(kvm)) {
3004                         mutex_lock(&kvm->lock);
3005                         if (irq_event.irq < 16)
3006                                 kvm_pic_set_irq(pic_irqchip(kvm),
3007                                         irq_event.irq,
3008                                         irq_event.level);
3009                         kvm_ioapic_set_irq(kvm->vioapic,
3010                                         irq_event.irq,
3011                                         irq_event.level);
3012                         mutex_unlock(&kvm->lock);
3013                         r = 0;
3014                 }
3015                 break;
3016         }
3017         case KVM_GET_IRQCHIP: {
3018                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3019                 struct kvm_irqchip chip;
3020
3021                 r = -EFAULT;
3022                 if (copy_from_user(&chip, argp, sizeof chip))
3023                         goto out;
3024                 r = -ENXIO;
3025                 if (!irqchip_in_kernel(kvm))
3026                         goto out;
3027                 r = kvm_vm_ioctl_get_irqchip(kvm, &chip);
3028                 if (r)
3029                         goto out;
3030                 r = -EFAULT;
3031                 if (copy_to_user(argp, &chip, sizeof chip))
3032                         goto out;
3033                 r = 0;
3034                 break;
3035         }
3036         case KVM_SET_IRQCHIP: {
3037                 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
3038                 struct kvm_irqchip chip;
3039
3040                 r = -EFAULT;
3041                 if (copy_from_user(&chip, argp, sizeof chip))
3042                         goto out;
3043                 r = -ENXIO;
3044                 if (!irqchip_in_kernel(kvm))
3045                         goto out;
3046                 r = kvm_vm_ioctl_set_irqchip(kvm, &chip);
3047                 if (r)
3048                         goto out;
3049                 r = 0;
3050                 break;
3051         }
3052         default:
3053                 ;
3054         }
3055 out:
3056         return r;
3057 }
3058
3059 static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
3060                                   unsigned long address,
3061                                   int *type)
3062 {
3063         struct kvm *kvm = vma->vm_file->private_data;
3064         unsigned long pgoff;
3065         struct page *page;
3066
3067         pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
3068         page = gfn_to_page(kvm, pgoff);
3069         if (!page)
3070                 return NOPAGE_SIGBUS;
3071         get_page(page);
3072         if (type != NULL)
3073                 *type = VM_FAULT_MINOR;
3074
3075         return page;
3076 }
3077
3078 static struct vm_operations_struct kvm_vm_vm_ops = {
3079         .nopage = kvm_vm_nopage,
3080 };
3081
3082 static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
3083 {
3084         vma->vm_ops = &kvm_vm_vm_ops;
3085         return 0;
3086 }
3087
3088 static struct file_operations kvm_vm_fops = {
3089         .release        = kvm_vm_release,
3090         .unlocked_ioctl = kvm_vm_ioctl,
3091         .compat_ioctl   = kvm_vm_ioctl,
3092         .mmap           = kvm_vm_mmap,
3093 };
3094
3095 static int kvm_dev_ioctl_create_vm(void)
3096 {
3097         int fd, r;
3098         struct inode *inode;
3099         struct file *file;
3100         struct kvm *kvm;
3101
3102         kvm = kvm_create_vm();
3103         if (IS_ERR(kvm))
3104                 return PTR_ERR(kvm);
3105         r = anon_inode_getfd(&fd, &inode, &file, "kvm-vm", &kvm_vm_fops, kvm);
3106         if (r) {
3107                 kvm_destroy_vm(kvm);
3108                 return r;
3109         }
3110
3111         kvm->filp = file;
3112
3113         return fd;
3114 }
3115
3116 static long kvm_dev_ioctl(struct file *filp,
3117                           unsigned int ioctl, unsigned long arg)
3118 {
3119         void __user *argp = (void __user *)arg;
3120         long r = -EINVAL;
3121
3122         switch (ioctl) {
3123         case KVM_GET_API_VERSION:
3124                 r = -EINVAL;
3125                 if (arg)
3126                         goto out;
3127                 r = KVM_API_VERSION;
3128                 break;
3129         case KVM_CREATE_VM:
3130                 r = -EINVAL;
3131                 if (arg)
3132                         goto out;
3133                 r = kvm_dev_ioctl_create_vm();
3134                 break;
3135         case KVM_CHECK_EXTENSION: {
3136                 int ext = (long)argp;
3137
3138                 switch (ext) {
3139                 case KVM_CAP_IRQCHIP:
3140                 case KVM_CAP_HLT:
3141                 case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
3142                 case KVM_CAP_USER_MEMORY:
3143                         r = 1;
3144                         break;
3145                 default:
3146                         r = 0;
3147                         break;
3148                 }
3149                 break;
3150         }
3151         case KVM_GET_VCPU_MMAP_SIZE:
3152                 r = -EINVAL;
3153                 if (arg)
3154                         goto out;
3155                 r = 2 * PAGE_SIZE;
3156                 break;
3157         default:
3158                 return kvm_arch_dev_ioctl(filp, ioctl, arg);
3159         }
3160 out:
3161         return r;
3162 }
3163
3164 static struct file_operations kvm_chardev_ops = {
3165         .unlocked_ioctl = kvm_dev_ioctl,
3166         .compat_ioctl   = kvm_dev_ioctl,
3167 };
3168
3169 static struct miscdevice kvm_dev = {
3170         KVM_MINOR,
3171         "kvm",
3172         &kvm_chardev_ops,
3173 };
3174
3175 /*
3176  * Make sure that a cpu that is being hot-unplugged does not have any vcpus
3177  * cached on it.
3178  */
3179 static void decache_vcpus_on_cpu(int cpu)
3180 {
3181         struct kvm *vm;
3182         struct kvm_vcpu *vcpu;
3183         int i;
3184
3185         spin_lock(&kvm_lock);
3186         list_for_each_entry(vm, &vm_list, vm_list)
3187                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3188                         vcpu = vm->vcpus[i];
3189                         if (!vcpu)
3190                                 continue;
3191                         /*
3192                          * If the vcpu is locked, then it is running on some
3193                          * other cpu and therefore it is not cached on the
3194                          * cpu in question.
3195                          *
3196                          * If it's not locked, check the last cpu it executed
3197                          * on.
3198                          */
3199                         if (mutex_trylock(&vcpu->mutex)) {
3200                                 if (vcpu->cpu == cpu) {
3201                                         kvm_x86_ops->vcpu_decache(vcpu);
3202                                         vcpu->cpu = -1;
3203                                 }
3204                                 mutex_unlock(&vcpu->mutex);
3205                         }
3206                 }
3207         spin_unlock(&kvm_lock);
3208 }
3209
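     /*
      * Enable/disable the virtualization extensions on the current cpu,
      * using cpus_hardware_enabled to keep the operations idempotent
      * across the hotplug and reboot notifiers.
      */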
3210 static void hardware_enable(void *junk)
3211 {
3212         int cpu = raw_smp_processor_id();
3213
3214         if (cpu_isset(cpu, cpus_hardware_enabled))
3215                 return;
3216         cpu_set(cpu, cpus_hardware_enabled);
3217         kvm_x86_ops->hardware_enable(NULL);
3218 }
3219
3220 static void hardware_disable(void *junk)
3221 {
3222         int cpu = raw_smp_processor_id();
3223
3224         if (!cpu_isset(cpu, cpus_hardware_enabled))
3225                 return;
3226         cpu_clear(cpu, cpus_hardware_enabled);
3227         decache_vcpus_on_cpu(cpu);
3228         kvm_x86_ops->hardware_disable(NULL);
3229 }
3230
3231 static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val,
3232                            void *v)
3233 {
3234         int cpu = (long)v;
3235
3236         switch (val) {
3237         case CPU_DYING:
3238         case CPU_DYING_FROZEN:
3239                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3240                        cpu);
3241                 hardware_disable(NULL);
3242                 break;
3243         case CPU_UP_CANCELED:
3244         case CPU_UP_CANCELED_FROZEN:
3245                 printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n",
3246                        cpu);
3247                 smp_call_function_single(cpu, hardware_disable, NULL, 0, 1);
3248                 break;
3249         case CPU_ONLINE:
3250         case CPU_ONLINE_FROZEN:
3251                 printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n",
3252                        cpu);
3253                 smp_call_function_single(cpu, hardware_enable, NULL, 0, 1);
3254                 break;
3255         }
3256         return NOTIFY_OK;
3257 }
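
/*
 * In the hotplug notifier above, CPU_DYING is delivered on the dying cpu
 * itself, so hardware_disable() can be called directly; CPU_UP_CANCELED and
 * CPU_ONLINE may be delivered on a different cpu, so the work is pushed to
 * the target cpu with smp_call_function_single().
 */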
3258
3259 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
3260                       void *v)
3261 {
3262         if (val == SYS_RESTART) {
3263                 /*
3264                  * Some BIOSes (the author's, at least) hang on reboot if
3265                  * the CPU is still in VMX root mode, so force it off.
3266                  */
3267                 printk(KERN_INFO "kvm: exiting hardware virtualization\n");
3268                 on_each_cpu(hardware_disable, NULL, 0, 1);
3269         }
3270         return NOTIFY_OK;
3271 }
3272
3273 static struct notifier_block kvm_reboot_notifier = {
3274         .notifier_call = kvm_reboot,
3275         .priority = 0,
3276 };
3277
3278 void kvm_io_bus_init(struct kvm_io_bus *bus)
3279 {
3280         memset(bus, 0, sizeof(*bus));
3281 }
3282
3283 void kvm_io_bus_destroy(struct kvm_io_bus *bus)
3284 {
3285         int i;
3286
3287         for (i = 0; i < bus->dev_count; i++) {
3288                 struct kvm_io_device *pos = bus->devs[i];
3289
3290                 kvm_iodevice_destructor(pos);
3291         }
3292 }
3293
3294 struct kvm_io_device *kvm_io_bus_find_dev(struct kvm_io_bus *bus, gpa_t addr)
3295 {
3296         int i;
3297
3298         for (i = 0; i < bus->dev_count; i++) {
3299                 struct kvm_io_device *pos = bus->devs[i];
3300
3301                 if (pos->in_range(pos, addr))
3302                         return pos;
3303         }
3304
3305         return NULL;
3306 }
3307
3308 void kvm_io_bus_register_dev(struct kvm_io_bus *bus, struct kvm_io_device *dev)
3309 {
3310         BUG_ON(bus->dev_count >= NR_IOBUS_DEVS);
3311
3312         bus->devs[bus->dev_count++] = dev;
3313 }
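
/*
 * The kvm_io_bus is a small fixed-size table (at most NR_IOBUS_DEVS entries)
 * of in-kernel device models.  kvm_io_bus_find_dev() walks it linearly and
 * asks each device, via its in_range() callback, whether it claims the given
 * address, so MMIO and port I/O exits can be routed to in-kernel devices
 * without a round trip to userspace.
 */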
3314
3315 static struct notifier_block kvm_cpu_notifier = {
3316         .notifier_call = kvm_cpu_hotplug,
3317         .priority = 20, /* must be > scheduler priority */
3318 };
3319
3320 static u64 stat_get(void *_offset)
3321 {
3322         unsigned offset = (long)_offset;
3323         u64 total = 0;
3324         struct kvm *kvm;
3325         struct kvm_vcpu *vcpu;
3326         int i;
3327
3328         spin_lock(&kvm_lock);
3329         list_for_each_entry(kvm, &vm_list, vm_list)
3330                 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
3331                         vcpu = kvm->vcpus[i];
3332                         if (vcpu)
3333                                 total += *(u32 *)((void *)vcpu + offset);
3334                 }
3335         spin_unlock(&kvm_lock);
3336         return total;
3337 }
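
/*
 * stat_get() backs the debugfs counters: the opaque argument is an offset
 * into struct kvm_vcpu (see STAT_OFFSET and debugfs_entries), and the value
 * returned is that u32 counter summed over every vcpu of every VM on
 * vm_list.
 */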
3338
3339 DEFINE_SIMPLE_ATTRIBUTE(stat_fops, stat_get, NULL, "%llu\n");
3340
3341 static __init void kvm_init_debug(void)
3342 {
3343         struct kvm_stats_debugfs_item *p;
3344
3345         debugfs_dir = debugfs_create_dir("kvm", NULL);
3346         for (p = debugfs_entries; p->name; ++p)
3347                 p->dentry = debugfs_create_file(p->name, 0444, debugfs_dir,
3348                                                 (void *)(long)p->offset,
3349                                                 &stat_fops);
3350 }
3351
3352 static void kvm_exit_debug(void)
3353 {
3354         struct kvm_stats_debugfs_item *p;
3355
3356         for (p = debugfs_entries; p->name; ++p)
3357                 debugfs_remove(p->dentry);
3358         debugfs_remove(debugfs_dir);
3359 }
3360
3361 static int kvm_suspend(struct sys_device *dev, pm_message_t state)
3362 {
3363         hardware_disable(NULL);
3364         return 0;
3365 }
3366
3367 static int kvm_resume(struct sys_device *dev)
3368 {
3369         hardware_enable(NULL);
3370         return 0;
3371 }
3372
3373 static struct sysdev_class kvm_sysdev_class = {
3374         .name = "kvm",
3375         .suspend = kvm_suspend,
3376         .resume = kvm_resume,
3377 };
3378
3379 static struct sys_device kvm_sysdev = {
3380         .id = 0,
3381         .cls = &kvm_sysdev_class,
3382 };
3383
3384 hpa_t bad_page_address;
3385
3386 static inline
3387 struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn)
3388 {
3389         return container_of(pn, struct kvm_vcpu, preempt_notifier);
3390 }
3391
3392 static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
3393 {
3394         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3395
3396         kvm_x86_ops->vcpu_load(vcpu, cpu);
3397 }
3398
3399 static void kvm_sched_out(struct preempt_notifier *pn,
3400                           struct task_struct *next)
3401 {
3402         struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
3403
3404         kvm_x86_ops->vcpu_put(vcpu);
3405 }
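
/*
 * kvm_sched_in()/kvm_sched_out() are the preempt notifier hooks wired up in
 * kvm_preempt_ops below: when the task running a vcpu is preempted, the
 * vcpu's state is saved via vcpu_put(), and when it is scheduled back in the
 * state is reloaded via vcpu_load() on whichever cpu it now runs on.
 */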
3406
3407 int kvm_init_x86(struct kvm_x86_ops *ops, unsigned int vcpu_size,
3408                   struct module *module)
3409 {
3410         int r;
3411         int cpu;
3412
3413         if (kvm_x86_ops) {
3414                 printk(KERN_ERR "kvm: an arch module is already loaded\n");
3415                 return -EEXIST;
3416         }
3417
3418         if (!ops->cpu_has_kvm_support()) {
3419                 printk(KERN_ERR "kvm: no hardware support\n");
3420                 return -EOPNOTSUPP;
3421         }
3422         if (ops->disabled_by_bios()) {
3423                 printk(KERN_ERR "kvm: disabled by bios\n");
3424                 return -EOPNOTSUPP;
3425         }
3426
3427         kvm_x86_ops = ops;
3428
3429         r = kvm_x86_ops->hardware_setup();
3430         if (r < 0)
3431                 goto out;
3432
3433         for_each_online_cpu(cpu) {
3434                 smp_call_function_single(cpu,
3435                                 kvm_x86_ops->check_processor_compatibility,
3436                                 &r, 0, 1);
3437                 if (r < 0)
3438                         goto out_free_0;
3439         }
3440
3441         on_each_cpu(hardware_enable, NULL, 0, 1);
3442         r = register_cpu_notifier(&kvm_cpu_notifier);
3443         if (r)
3444                 goto out_free_1;
3445         register_reboot_notifier(&kvm_reboot_notifier);
3446
3447         r = sysdev_class_register(&kvm_sysdev_class);
3448         if (r)
3449                 goto out_free_2;
3450
3451         r = sysdev_register(&kvm_sysdev);
3452         if (r)
3453                 goto out_free_3;
3454
3455         /* A kmem cache lets us meet the alignment requirements of fx_save. */
3456         kvm_vcpu_cache = kmem_cache_create("kvm_vcpu", vcpu_size,
3457                                            __alignof__(struct kvm_vcpu), 0, NULL);
3458         if (!kvm_vcpu_cache) {
3459                 r = -ENOMEM;
3460                 goto out_free_4;
3461         }
3462
3463         kvm_chardev_ops.owner = module;
3464
3465         r = misc_register(&kvm_dev);
3466         if (r) {
3467                 printk(KERN_ERR "kvm: misc device register failed\n");
3468                 goto out_free;
3469         }
3470
3471         kvm_preempt_ops.sched_in = kvm_sched_in;
3472         kvm_preempt_ops.sched_out = kvm_sched_out;
3473
3474         kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
3475
3476         return 0;
3477
3478 out_free:
3479         kmem_cache_destroy(kvm_vcpu_cache);
3480 out_free_4:
3481         sysdev_unregister(&kvm_sysdev);
3482 out_free_3:
3483         sysdev_class_unregister(&kvm_sysdev_class);
3484 out_free_2:
3485         unregister_reboot_notifier(&kvm_reboot_notifier);
3486         unregister_cpu_notifier(&kvm_cpu_notifier);
3487 out_free_1:
3488         on_each_cpu(hardware_disable, NULL, 0, 1);
3489 out_free_0:
3490         kvm_x86_ops->hardware_unsetup();
3491 out:
3492         kvm_x86_ops = NULL;
3493         return r;
3494 }
3495 EXPORT_SYMBOL_GPL(kvm_init_x86);
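
/*
 * A minimal sketch of how an architecture module is expected to use this
 * entry point (illustrative only; the vmx_x86_ops and struct vcpu_vmx names
 * are assumptions modelled on the kvm-intel module, not part of this file):
 *
 *      static int __init vmx_init(void)
 *      {
 *              return kvm_init_x86(&vmx_x86_ops, sizeof(struct vcpu_vmx),
 *                                  THIS_MODULE);
 *      }
 *
 * with a matching kvm_exit_x86() call from the module's exit path.
 */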
3496
3497 void kvm_exit_x86(void)
3498 {
3499         misc_deregister(&kvm_dev);
3500         kmem_cache_destroy(kvm_vcpu_cache);
3501         sysdev_unregister(&kvm_sysdev);
3502         sysdev_class_unregister(&kvm_sysdev_class);
3503         unregister_reboot_notifier(&kvm_reboot_notifier);
3504         unregister_cpu_notifier(&kvm_cpu_notifier);
3505         on_each_cpu(hardware_disable, NULL, 0, 1);
3506         kvm_x86_ops->hardware_unsetup();
3507         kvm_x86_ops = NULL;
3508 }
3509 EXPORT_SYMBOL_GPL(kvm_exit_x86);
3510
3511 static __init int kvm_init(void)
3512 {
3513         static struct page *bad_page;
3514         int r;
3515
3516         r = kvm_mmu_module_init();
3517         if (r)
3518                 goto out4;
3519
3520         kvm_init_debug();
3521
3522         kvm_arch_init();
3523
3524         bad_page = alloc_page(GFP_KERNEL);
3525
3526         if (bad_page == NULL) {
3527                 r = -ENOMEM;
3528                 goto out;
3529         }
3530
3531         bad_page_address = page_to_pfn(bad_page) << PAGE_SHIFT;
3532         memset(__va(bad_page_address), 0, PAGE_SIZE);
3533
3534         return 0;
3535
3536 out:
3537         kvm_exit_debug();
3538         kvm_mmu_module_exit();
3539 out4:
3540         return r;
3541 }
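
/*
 * bad_page, allocated and zeroed in kvm_init() above, backs bad_page_address;
 * presumably it serves as a catch-all frame for guest addresses that cannot
 * be translated.  It is freed again in kvm_exit() below.
 */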
3542
3543 static __exit void kvm_exit(void)
3544 {
3545         kvm_exit_debug();
3546         __free_page(pfn_to_page(bad_page_address >> PAGE_SHIFT));
3547         kvm_mmu_module_exit();
3548 }
3549
3550 module_init(kvm_init)
3551 module_exit(kvm_exit)