linux-2.6 / drivers / kvm / vmx.c
blob 6270df58e05506d1dfe26f89a7e7ebbe2b3d2c3e
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * Copyright (C) 2006 Qumranet, Inc.
8  *
9  * Authors:
10  *   Avi Kivity   <avi@qumranet.com>
11  *   Yaniv Kamay  <yaniv@qumranet.com>
12  *
13  * This work is licensed under the terms of the GNU GPL, version 2.  See
14  * the COPYING file in the top-level directory.
15  *
16  */
17
18 #include "kvm.h"
19 #include "vmx.h"
20 #include "kvm_vmx.h"
21 #include <linux/module.h>
22 #include <linux/kernel.h>
23 #include <linux/mm.h>
24 #include <linux/highmem.h>
25 #include <linux/profile.h>
26 #include <asm/io.h>
27 #include <asm/desc.h>
28
29 #include "segment_descriptor.h"
30
31 MODULE_AUTHOR("Qumranet");
32 MODULE_LICENSE("GPL");
33
34 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
35 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
36
37 #ifdef CONFIG_X86_64
38 #define HOST_IS_64 1
39 #else
40 #define HOST_IS_64 0
41 #endif
42
43 static struct vmcs_descriptor {
44         int size;
45         int order;
46         u32 revision_id;
47 } vmcs_descriptor;
48
49 #define VMX_SEGMENT_FIELD(seg)                                  \
50         [VCPU_SREG_##seg] = {                                   \
51                 .selector = GUEST_##seg##_SELECTOR,             \
52                 .base = GUEST_##seg##_BASE,                     \
53                 .limit = GUEST_##seg##_LIMIT,                   \
54                 .ar_bytes = GUEST_##seg##_AR_BYTES,             \
55         }
56
57 static struct kvm_vmx_segment_field {
58         unsigned selector;
59         unsigned base;
60         unsigned limit;
61         unsigned ar_bytes;
62 } kvm_vmx_segment_fields[] = {
63         VMX_SEGMENT_FIELD(CS),
64         VMX_SEGMENT_FIELD(DS),
65         VMX_SEGMENT_FIELD(ES),
66         VMX_SEGMENT_FIELD(FS),
67         VMX_SEGMENT_FIELD(GS),
68         VMX_SEGMENT_FIELD(SS),
69         VMX_SEGMENT_FIELD(TR),
70         VMX_SEGMENT_FIELD(LDTR),
71 };
72
73 static const u32 vmx_msr_index[] = {
74 #ifdef CONFIG_X86_64
75         MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE,
76 #endif
77         MSR_EFER, MSR_K6_STAR,
78 };
79 #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
80
81 #ifdef CONFIG_X86_64
82 static unsigned msr_offset_kernel_gs_base;
83 #define NR_64BIT_MSRS 4
84 #else
85 #define NR_64BIT_MSRS 0
86 #endif
87
88 static inline int is_page_fault(u32 intr_info)
89 {
90         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
91                              INTR_INFO_VALID_MASK)) ==
92                 (INTR_TYPE_EXCEPTION | PF_VECTOR | INTR_INFO_VALID_MASK);
93 }
94
95 static inline int is_external_interrupt(u32 intr_info)
96 {
97         return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
98                 == (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
99 }
100
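/*
 * Illustrative sketch only, not part of the driver: how the helpers above
 * pick apart the VM-exit interruption-information field.  The masks are
 * written out per the Intel layout rather than taken from vmx.h, and the
 * example_* names are made up for illustration.
 */
static inline u32 example_intr_vector(u32 intr_info)
{
        return intr_info & 0xff;              /* bits 7:0  - vector */
}

static inline u32 example_intr_type(u32 intr_info)
{
        return intr_info & 0x700;             /* bits 10:8 - type (0=ext intr, 2=NMI, 3=exception) */
}

static inline int example_intr_is_valid(u32 intr_info)
{
        return (intr_info & 0x80000000) != 0; /* bit 31    - valid */
}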
101 static struct vmx_msr_entry *find_msr_entry(struct kvm_vcpu *vcpu, u32 msr)
102 {
103         int i;
104
105         for (i = 0; i < vcpu->nmsrs; ++i)
106                 if (vcpu->guest_msrs[i].index == msr)
107                         return &vcpu->guest_msrs[i];
108         return NULL;
109 }
110
111 static void vmcs_clear(struct vmcs *vmcs)
112 {
113         u64 phys_addr = __pa(vmcs);
114         u8 error;
115
116         asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0"
117                       : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
118                       : "cc", "memory");
119         if (error)
120                 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
121                        vmcs, phys_addr);
122 }
123
124 static void __vcpu_clear(void *arg)
125 {
126         struct kvm_vcpu *vcpu = arg;
127         int cpu = raw_smp_processor_id();
128
129         if (vcpu->cpu == cpu)
130                 vmcs_clear(vcpu->vmcs);
131         if (per_cpu(current_vmcs, cpu) == vcpu->vmcs)
132                 per_cpu(current_vmcs, cpu) = NULL;
133 }
134
135 static void vcpu_clear(struct kvm_vcpu *vcpu)
136 {
137         if (vcpu->cpu != raw_smp_processor_id() && vcpu->cpu != -1)
138                 smp_call_function_single(vcpu->cpu, __vcpu_clear, vcpu, 0, 1);
139         else
140                 __vcpu_clear(vcpu);
141         vcpu->launched = 0;
142 }
143
144 static unsigned long vmcs_readl(unsigned long field)
145 {
146         unsigned long value;
147
148         asm volatile (ASM_VMX_VMREAD_RDX_RAX
149                       : "=a"(value) : "d"(field) : "cc");
150         return value;
151 }
152
153 static u16 vmcs_read16(unsigned long field)
154 {
155         return vmcs_readl(field);
156 }
157
158 static u32 vmcs_read32(unsigned long field)
159 {
160         return vmcs_readl(field);
161 }
162
163 static u64 vmcs_read64(unsigned long field)
164 {
165 #ifdef CONFIG_X86_64
166         return vmcs_readl(field);
167 #else
168         return vmcs_readl(field) | ((u64)vmcs_readl(field+1) << 32);
169 #endif
170 }
171
172 static noinline void vmwrite_error(unsigned long field, unsigned long value)
173 {
174         printk(KERN_ERR "vmwrite error: reg %lx value %lx (err %d)\n",
175                field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
176         dump_stack();
177 }
178
179 static void vmcs_writel(unsigned long field, unsigned long value)
180 {
181         u8 error;
182
183         asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0"
184                        : "=q"(error) : "a"(value), "d"(field) : "cc" );
185         if (unlikely(error))
186                 vmwrite_error(field, value);
187 }
188
189 static void vmcs_write16(unsigned long field, u16 value)
190 {
191         vmcs_writel(field, value);
192 }
193
194 static void vmcs_write32(unsigned long field, u32 value)
195 {
196         vmcs_writel(field, value);
197 }
198
199 static void vmcs_write64(unsigned long field, u64 value)
200 {
201 #ifdef CONFIG_X86_64
202         vmcs_writel(field, value);
203 #else
204         vmcs_writel(field, value);
205         asm volatile ("");
206         vmcs_writel(field+1, value >> 32);
207 #endif
208 }
209
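/*
 * Sketch, not part of the driver: on a 32-bit host a 64-bit VMCS field is
 * accessed as two adjacent encodings -- 'field' holds the low 32 bits and
 * 'field + 1' the high 32 bits -- which is what vmcs_read64() and
 * vmcs_write64() above piece together.  example_* name is illustrative.
 */
static inline u64 example_combine_vmcs64(u32 lo, u32 hi)
{
        return (u64)lo | ((u64)hi << 32);
}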
210 /*
211  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
212  * vcpu mutex is already taken.
213  */
214 static void vmx_vcpu_load(struct kvm_vcpu *vcpu)
215 {
216         u64 phys_addr = __pa(vcpu->vmcs);
217         int cpu;
218
219         cpu = get_cpu();
220
221         if (vcpu->cpu != cpu)
222                 vcpu_clear(vcpu);
223
224         if (per_cpu(current_vmcs, cpu) != vcpu->vmcs) {
225                 u8 error;
226
227                 per_cpu(current_vmcs, cpu) = vcpu->vmcs;
228                 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0"
229                               : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
230                               : "cc");
231                 if (error)
232                         printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
233                                vcpu->vmcs, phys_addr);
234         }
235
236         if (vcpu->cpu != cpu) {
237                 struct descriptor_table dt;
238                 unsigned long sysenter_esp;
239
240                 vcpu->cpu = cpu;
241                 /*
242                  * Linux uses per-cpu TSS and GDT, so set these when switching
243                  * processors.
244                  */
245                 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */
246                 get_gdt(&dt);
247                 vmcs_writel(HOST_GDTR_BASE, dt.base);   /* 22.2.4 */
248
249                 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
250                 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
251         }
252 }
253
254 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
255 {
256         put_cpu();
257 }
258
259 static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
260 {
261         vcpu_clear(vcpu);
262 }
263
264 static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
265 {
266         return vmcs_readl(GUEST_RFLAGS);
267 }
268
269 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
270 {
271         vmcs_writel(GUEST_RFLAGS, rflags);
272 }
273
274 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
275 {
276         unsigned long rip;
277         u32 interruptibility;
278
279         rip = vmcs_readl(GUEST_RIP);
280         rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
281         vmcs_writel(GUEST_RIP, rip);
282
283         /*
284          * We emulated an instruction, so temporary interrupt blocking
285          * should be removed, if set.
286          */
287         interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
288         if (interruptibility & 3)
289                 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
290                              interruptibility & ~3);
291         vcpu->interrupt_window_open = 1;
292 }
293
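/*
 * Sketch, with the bit meanings assumed from the Intel interruptibility-
 * state layout rather than taken from vmx.h: the "& 3" above clears the
 * two single-instruction blocking bits so a pending interrupt can be
 * delivered after the emulated instruction.  example_* names are made up.
 */
#define EXAMPLE_GUEST_INTR_STATE_STI    0x00000001 /* blocking by STI */
#define EXAMPLE_GUEST_INTR_STATE_MOV_SS 0x00000002 /* blocking by MOV SS / POP SS */

static inline u32 example_clear_intr_shadow(u32 interruptibility)
{
        return interruptibility &
                ~(EXAMPLE_GUEST_INTR_STATE_STI | EXAMPLE_GUEST_INTR_STATE_MOV_SS);
}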
294 static void vmx_inject_gp(struct kvm_vcpu *vcpu, unsigned error_code)
295 {
296         printk(KERN_DEBUG "inject_general_protection: rip 0x%lx\n",
297                vmcs_readl(GUEST_RIP));
298         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
299         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
300                      GP_VECTOR |
301                      INTR_TYPE_EXCEPTION |
302                      INTR_INFO_DELIEVER_CODE_MASK |
303                      INTR_INFO_VALID_MASK);
304 }
305
306 /*
307  * Set up the vmcs to automatically save and restore system
308  * msrs.  Don't touch the 64-bit msrs if the guest is in legacy
309  * mode, as fiddling with msrs is very expensive.
310  */
311 static void setup_msrs(struct kvm_vcpu *vcpu)
312 {
313         int nr_skip, nr_good_msrs;
314
315         if (is_long_mode(vcpu))
316                 nr_skip = NR_BAD_MSRS;
317         else
318                 nr_skip = NR_64BIT_MSRS;
319         nr_good_msrs = vcpu->nmsrs - nr_skip;
320
321         vmcs_writel(VM_ENTRY_MSR_LOAD_ADDR,
322                     virt_to_phys(vcpu->guest_msrs + nr_skip));
323         vmcs_writel(VM_EXIT_MSR_STORE_ADDR,
324                     virt_to_phys(vcpu->guest_msrs + nr_skip));
325         vmcs_writel(VM_EXIT_MSR_LOAD_ADDR,
326                     virt_to_phys(vcpu->host_msrs + nr_skip));
327         vmcs_write32(VM_EXIT_MSR_STORE_COUNT, nr_good_msrs); /* 22.2.2 */
328         vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, nr_good_msrs);  /* 22.2.2 */
329         vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, nr_good_msrs); /* 22.2.2 */
330 }
331
332 /*
333  * reads and returns guest's timestamp counter "register"
334  * guest_tsc = host_tsc + tsc_offset    -- 21.3
335  */
336 static u64 guest_read_tsc(void)
337 {
338         u64 host_tsc, tsc_offset;
339
340         rdtscll(host_tsc);
341         tsc_offset = vmcs_read64(TSC_OFFSET);
342         return host_tsc + tsc_offset;
343 }
344
345 /*
346  * writes 'guest_tsc' into guest's timestamp counter "register"
347  * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc
348  */
349 static void guest_write_tsc(u64 guest_tsc)
350 {
351         u64 host_tsc;
352
353         rdtscll(host_tsc);
354         vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc);
355 }
356
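/*
 * Sketch, not part of the driver: the two helpers above implement the
 * relation guest_tsc = host_tsc + TSC_OFFSET (21.3), so writing a guest
 * TSC value reduces to storing the difference from the current host TSC.
 * example_* names are illustrative only.
 */
static inline u64 example_tsc_offset(u64 guest_tsc, u64 host_tsc)
{
        return guest_tsc - host_tsc;    /* value placed in TSC_OFFSET */
}

static inline u64 example_guest_tsc(u64 host_tsc, u64 tsc_offset)
{
        return host_tsc + tsc_offset;   /* what RDTSC returns to the guest */
}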
357 static void reload_tss(void)
358 {
359 #ifndef CONFIG_X86_64
360
361         /*
362          * VT restores TR but not its size.  Useless.
363          */
364         struct descriptor_table gdt;
365         struct segment_descriptor *descs;
366
367         get_gdt(&gdt);
368         descs = (void *)gdt.base;
369         descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
370         load_TR_desc();
371 #endif
372 }
373
374 /*
375  * Reads an msr value (of 'msr_index') into 'pdata'.
376  * Returns 0 on success, non-0 otherwise.
377  * Assumes vcpu_load() was already called.
378  */
379 static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
380 {
381         u64 data;
382         struct vmx_msr_entry *msr;
383
384         if (!pdata) {
385                 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
386                 return -EINVAL;
387         }
388
389         switch (msr_index) {
390 #ifdef CONFIG_X86_64
391         case MSR_FS_BASE:
392                 data = vmcs_readl(GUEST_FS_BASE);
393                 break;
394         case MSR_GS_BASE:
395                 data = vmcs_readl(GUEST_GS_BASE);
396                 break;
397         case MSR_EFER:
398                 return kvm_get_msr_common(vcpu, msr_index, pdata);
399 #endif
400         case MSR_IA32_TIME_STAMP_COUNTER:
401                 data = guest_read_tsc();
402                 break;
403         case MSR_IA32_SYSENTER_CS:
404                 data = vmcs_read32(GUEST_SYSENTER_CS);
405                 break;
406         case MSR_IA32_SYSENTER_EIP:
407                 data = vmcs_readl(GUEST_SYSENTER_EIP);
408                 break;
409         case MSR_IA32_SYSENTER_ESP:
410                 data = vmcs_readl(GUEST_SYSENTER_ESP);
411                 break;
412         default:
413                 msr = find_msr_entry(vcpu, msr_index);
414                 if (msr) {
415                         data = msr->data;
416                         break;
417                 }
418                 return kvm_get_msr_common(vcpu, msr_index, pdata);
419         }
420
421         *pdata = data;
422         return 0;
423 }
424
425 /*
426  * Writes msr value into the appropriate "register".
427  * Returns 0 on success, non-0 otherwise.
428  * Assumes vcpu_load() was already called.
429  */
430 static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
431 {
432         struct vmx_msr_entry *msr;
433         switch (msr_index) {
434 #ifdef CONFIG_X86_64
435         case MSR_EFER:
436                 return kvm_set_msr_common(vcpu, msr_index, data);
437         case MSR_FS_BASE:
438                 vmcs_writel(GUEST_FS_BASE, data);
439                 break;
440         case MSR_GS_BASE:
441                 vmcs_writel(GUEST_GS_BASE, data);
442                 break;
443 #endif
444         case MSR_IA32_SYSENTER_CS:
445                 vmcs_write32(GUEST_SYSENTER_CS, data);
446                 break;
447         case MSR_IA32_SYSENTER_EIP:
448                 vmcs_writel(GUEST_SYSENTER_EIP, data);
449                 break;
450         case MSR_IA32_SYSENTER_ESP:
451                 vmcs_writel(GUEST_SYSENTER_ESP, data);
452                 break;
453         case MSR_IA32_TIME_STAMP_COUNTER:
454                 guest_write_tsc(data);
455                 break;
456         default:
457                 msr = find_msr_entry(vcpu, msr_index);
458                 if (msr) {
459                         msr->data = data;
460                         break;
461                 }
462                 return kvm_set_msr_common(vcpu, msr_index, data);
465         }
466
467         return 0;
468 }
469
470 /*
471  * Sync the rsp and rip registers into the vcpu structure.  This allows
472  * registers to be accessed by indexing vcpu->regs.
473  */
474 static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
475 {
476         vcpu->regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
477         vcpu->rip = vmcs_readl(GUEST_RIP);
478 }
479
480 /*
481  * Syncs rsp and rip back into the vmcs.  Should be called after possible
482  * modification.
483  */
484 static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
485 {
486         vmcs_writel(GUEST_RSP, vcpu->regs[VCPU_REGS_RSP]);
487         vmcs_writel(GUEST_RIP, vcpu->rip);
488 }
489
490 static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
491 {
492         unsigned long dr7 = 0x400;
493         u32 exception_bitmap;
494         int old_singlestep;
495
496         exception_bitmap = vmcs_read32(EXCEPTION_BITMAP);
497         old_singlestep = vcpu->guest_debug.singlestep;
498
499         vcpu->guest_debug.enabled = dbg->enabled;
500         if (vcpu->guest_debug.enabled) {
501                 int i;
502
503                 dr7 |= 0x200;  /* exact */
504                 for (i = 0; i < 4; ++i) {
505                         if (!dbg->breakpoints[i].enabled)
506                                 continue;
507                         vcpu->guest_debug.bp[i] = dbg->breakpoints[i].address;
508                         dr7 |= 2 << (i*2);    /* global enable */
509                         dr7 |= 0 << (i*4+16); /* execution breakpoint */
510                 }
511
512                 exception_bitmap |= (1u << 1);  /* Trap debug exceptions */
513
514                 vcpu->guest_debug.singlestep = dbg->singlestep;
515         } else {
516                 exception_bitmap &= ~(1u << 1); /* Ignore debug exceptions */
517                 vcpu->guest_debug.singlestep = 0;
518         }
519
520         if (old_singlestep && !vcpu->guest_debug.singlestep) {
521                 unsigned long flags;
522
523                 flags = vmcs_readl(GUEST_RFLAGS);
524                 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
525                 vmcs_writel(GUEST_RFLAGS, flags);
526         }
527
528         vmcs_write32(EXCEPTION_BITMAP, exception_bitmap);
529         vmcs_writel(GUEST_DR7, dr7);
530
531         return 0;
532 }
533
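/*
 * Sketch, assuming the standard DR7 bit layout rather than any driver
 * constant: set_guest_debug() above builds DR7 with bit 10 (0x400) as the
 * always-one bit, 0x200 (GE) for "exact" matching, 2 << (i*2) as the
 * global enable for slot i, and an all-zero R/W + LEN field at
 * bits 16 + i*4 for a one-byte execution breakpoint.  example_* name is
 * illustrative only.
 */
static inline unsigned long example_dr7_exec_bp(unsigned long dr7, int slot)
{
        dr7 |= 2UL << (slot * 2);           /* Gi: global enable */
        dr7 &= ~(0xfUL << (slot * 4 + 16)); /* R/Wi = LENi = 0: execute, 1 byte */
        return dr7;
}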
534 static __init int cpu_has_kvm_support(void)
535 {
536         unsigned long ecx = cpuid_ecx(1);
537         return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
538 }
539
540 static __init int vmx_disabled_by_bios(void)
541 {
542         u64 msr;
543
544         rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
545         return (msr & 5) == 1; /* locked but not enabled */
546 }
547
548 static void hardware_enable(void *garbage)
549 {
550         int cpu = raw_smp_processor_id();
551         u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
552         u64 old;
553
554         rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
555         if ((old & 5) != 5)
556                 /* enable and lock */
557                 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 5);
558         write_cr4(read_cr4() | CR4_VMXE); /* FIXME: not cpu hotplug safe */
559         asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr)
560                       : "memory", "cc");
561 }
562
563 static void hardware_disable(void *garbage)
564 {
565         asm volatile (ASM_VMX_VMXOFF : : : "cc");
566 }
567
568 static __init void setup_vmcs_descriptor(void)
569 {
570         u32 vmx_msr_low, vmx_msr_high;
571
572         rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
573         vmcs_descriptor.size = vmx_msr_high & 0x1fff;
574         vmcs_descriptor.order = get_order(vmcs_descriptor.size);
575         vmcs_descriptor.revision_id = vmx_msr_low;
576 }
577
578 static struct vmcs *alloc_vmcs_cpu(int cpu)
579 {
580         int node = cpu_to_node(cpu);
581         struct page *pages;
582         struct vmcs *vmcs;
583
584         pages = alloc_pages_node(node, GFP_KERNEL, vmcs_descriptor.order);
585         if (!pages)
586                 return NULL;
587         vmcs = page_address(pages);
588         memset(vmcs, 0, vmcs_descriptor.size);
589         vmcs->revision_id = vmcs_descriptor.revision_id; /* vmcs revision id */
590         return vmcs;
591 }
592
593 static struct vmcs *alloc_vmcs(void)
594 {
595         return alloc_vmcs_cpu(raw_smp_processor_id());
596 }
597
598 static void free_vmcs(struct vmcs *vmcs)
599 {
600         free_pages((unsigned long)vmcs, vmcs_descriptor.order);
601 }
602
603 static __exit void free_kvm_area(void)
604 {
605         int cpu;
606
607         for_each_online_cpu(cpu)
608                 free_vmcs(per_cpu(vmxarea, cpu));
609 }
610
611 extern struct vmcs *alloc_vmcs_cpu(int cpu);
612
613 static __init int alloc_kvm_area(void)
614 {
615         int cpu;
616
617         for_each_online_cpu(cpu) {
618                 struct vmcs *vmcs;
619
620                 vmcs = alloc_vmcs_cpu(cpu);
621                 if (!vmcs) {
622                         free_kvm_area();
623                         return -ENOMEM;
624                 }
625
626                 per_cpu(vmxarea, cpu) = vmcs;
627         }
628         return 0;
629 }
630
631 static __init int hardware_setup(void)
632 {
633         setup_vmcs_descriptor();
634         return alloc_kvm_area();
635 }
636
637 static __exit void hardware_unsetup(void)
638 {
639         free_kvm_area();
640 }
641
642 static void update_exception_bitmap(struct kvm_vcpu *vcpu)
643 {
644         if (vcpu->rmode.active)
645                 vmcs_write32(EXCEPTION_BITMAP, ~0);
646         else
647                 vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
648 }
649
650 static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
651 {
652         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
653
654         if (vmcs_readl(sf->base) == save->base && (save->base & AR_S_MASK)) {
655                 vmcs_write16(sf->selector, save->selector);
656                 vmcs_writel(sf->base, save->base);
657                 vmcs_write32(sf->limit, save->limit);
658                 vmcs_write32(sf->ar_bytes, save->ar);
659         } else {
660                 u32 dpl = (vmcs_read16(sf->selector) & SELECTOR_RPL_MASK)
661                         << AR_DPL_SHIFT;
662                 vmcs_write32(sf->ar_bytes, 0x93 | dpl);
663         }
664 }
665
666 static void enter_pmode(struct kvm_vcpu *vcpu)
667 {
668         unsigned long flags;
669
670         vcpu->rmode.active = 0;
671
672         vmcs_writel(GUEST_TR_BASE, vcpu->rmode.tr.base);
673         vmcs_write32(GUEST_TR_LIMIT, vcpu->rmode.tr.limit);
674         vmcs_write32(GUEST_TR_AR_BYTES, vcpu->rmode.tr.ar);
675
676         flags = vmcs_readl(GUEST_RFLAGS);
677         flags &= ~(IOPL_MASK | X86_EFLAGS_VM);
678         flags |= (vcpu->rmode.save_iopl << IOPL_SHIFT);
679         vmcs_writel(GUEST_RFLAGS, flags);
680
681         vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~CR4_VME_MASK) |
682                         (vmcs_readl(CR4_READ_SHADOW) & CR4_VME_MASK));
683
684         update_exception_bitmap(vcpu);
685
686         fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->rmode.es);
687         fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->rmode.ds);
688         fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->rmode.gs);
689         fix_pmode_dataseg(VCPU_SREG_FS, &vcpu->rmode.fs);
690
691         vmcs_write16(GUEST_SS_SELECTOR, 0);
692         vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
693
694         vmcs_write16(GUEST_CS_SELECTOR,
695                      vmcs_read16(GUEST_CS_SELECTOR) & ~SELECTOR_RPL_MASK);
696         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
697 }
698
699 static int rmode_tss_base(struct kvm* kvm)
700 {
701         gfn_t base_gfn = kvm->memslots[0].base_gfn + kvm->memslots[0].npages - 3;
702         return base_gfn << PAGE_SHIFT;
703 }
704
705 static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
706 {
707         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
708
709         save->selector = vmcs_read16(sf->selector);
710         save->base = vmcs_readl(sf->base);
711         save->limit = vmcs_read32(sf->limit);
712         save->ar = vmcs_read32(sf->ar_bytes);
713         vmcs_write16(sf->selector, vmcs_readl(sf->base) >> 4);
714         vmcs_write32(sf->limit, 0xffff);
715         vmcs_write32(sf->ar_bytes, 0xf3);
716 }
717
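/*
 * Sketch, not part of the driver: in vm86/real mode a segment's base is
 * simply its selector shifted left by four, which is why fix_rmode_seg()
 * and enter_rmode() derive selectors as 'base >> 4' (and why a ROM base
 * of 0xffff0000 has to be remapped to 0xf0000 to stay representable).
 * example_* names are illustrative only.
 */
static inline unsigned long example_rmode_base(u16 selector)
{
        return (unsigned long)selector << 4;
}

static inline u16 example_rmode_selector(unsigned long base)
{
        return base >> 4;  /* only exact for paragraph-aligned bases below 1M */
}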
718 static void enter_rmode(struct kvm_vcpu *vcpu)
719 {
720         unsigned long flags;
721
722         vcpu->rmode.active = 1;
723
724         vcpu->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
725         vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
726
727         vcpu->rmode.tr.limit = vmcs_read32(GUEST_TR_LIMIT);
728         vmcs_write32(GUEST_TR_LIMIT, RMODE_TSS_SIZE - 1);
729
730         vcpu->rmode.tr.ar = vmcs_read32(GUEST_TR_AR_BYTES);
731         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
732
733         flags = vmcs_readl(GUEST_RFLAGS);
734         vcpu->rmode.save_iopl = (flags & IOPL_MASK) >> IOPL_SHIFT;
735
736         flags |= IOPL_MASK | X86_EFLAGS_VM;
737
738         vmcs_writel(GUEST_RFLAGS, flags);
739         vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | CR4_VME_MASK);
740         update_exception_bitmap(vcpu);
741
742         vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
743         vmcs_write32(GUEST_SS_LIMIT, 0xffff);
744         vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
745
746         vmcs_write32(GUEST_CS_AR_BYTES, 0xf3);
747         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
748         if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000)
749                 vmcs_writel(GUEST_CS_BASE, 0xf0000);
750         vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4);
751
752         fix_rmode_seg(VCPU_SREG_ES, &vcpu->rmode.es);
753         fix_rmode_seg(VCPU_SREG_DS, &vcpu->rmode.ds);
754         fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
755         fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
756 }
757
758 #ifdef CONFIG_X86_64
759
760 static void enter_lmode(struct kvm_vcpu *vcpu)
761 {
762         u32 guest_tr_ar;
763
764         guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
765         if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
766                 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
767                        __FUNCTION__);
768                 vmcs_write32(GUEST_TR_AR_BYTES,
769                              (guest_tr_ar & ~AR_TYPE_MASK)
770                              | AR_TYPE_BUSY_64_TSS);
771         }
772
773         vcpu->shadow_efer |= EFER_LMA;
774
775         find_msr_entry(vcpu, MSR_EFER)->data |= EFER_LMA | EFER_LME;
776         vmcs_write32(VM_ENTRY_CONTROLS,
777                      vmcs_read32(VM_ENTRY_CONTROLS)
778                      | VM_ENTRY_CONTROLS_IA32E_MASK);
779 }
780
781 static void exit_lmode(struct kvm_vcpu *vcpu)
782 {
783         vcpu->shadow_efer &= ~EFER_LMA;
784
785         vmcs_write32(VM_ENTRY_CONTROLS,
786                      vmcs_read32(VM_ENTRY_CONTROLS)
787                      & ~VM_ENTRY_CONTROLS_IA32E_MASK);
788 }
789
790 #endif
791
792 static void vmx_decache_cr0_cr4_guest_bits(struct kvm_vcpu *vcpu)
793 {
794         vcpu->cr0 &= KVM_GUEST_CR0_MASK;
795         vcpu->cr0 |= vmcs_readl(GUEST_CR0) & ~KVM_GUEST_CR0_MASK;
796
797         vcpu->cr4 &= KVM_GUEST_CR4_MASK;
798         vcpu->cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK;
799 }
800
801 static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
802 {
803         if (vcpu->rmode.active && (cr0 & CR0_PE_MASK))
804                 enter_pmode(vcpu);
805
806         if (!vcpu->rmode.active && !(cr0 & CR0_PE_MASK))
807                 enter_rmode(vcpu);
808
809 #ifdef CONFIG_X86_64
810         if (vcpu->shadow_efer & EFER_LME) {
811                 if (!is_paging(vcpu) && (cr0 & CR0_PG_MASK))
812                         enter_lmode(vcpu);
813                 if (is_paging(vcpu) && !(cr0 & CR0_PG_MASK))
814                         exit_lmode(vcpu);
815         }
816 #endif
817
818         vmcs_writel(CR0_READ_SHADOW, cr0);
819         vmcs_writel(GUEST_CR0,
820                     (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON);
821         vcpu->cr0 = cr0;
822 }
823
824 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
825 {
826         vmcs_writel(GUEST_CR3, cr3);
827 }
828
829 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
830 {
831         vmcs_writel(CR4_READ_SHADOW, cr4);
832         vmcs_writel(GUEST_CR4, cr4 | (vcpu->rmode.active ?
833                     KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON));
834         vcpu->cr4 = cr4;
835 }
836
837 #ifdef CONFIG_X86_64
838
839 static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
840 {
841         struct vmx_msr_entry *msr = find_msr_entry(vcpu, MSR_EFER);
842
843         vcpu->shadow_efer = efer;
844         if (efer & EFER_LMA) {
845                 vmcs_write32(VM_ENTRY_CONTROLS,
846                                      vmcs_read32(VM_ENTRY_CONTROLS) |
847                                      VM_ENTRY_CONTROLS_IA32E_MASK);
848                 msr->data = efer;
849
850         } else {
851                 vmcs_write32(VM_ENTRY_CONTROLS,
852                                      vmcs_read32(VM_ENTRY_CONTROLS) &
853                                      ~VM_ENTRY_CONTROLS_IA32E_MASK);
854
855                 msr->data = efer & ~EFER_LME;
856         }
857         setup_msrs(vcpu);
858 }
859
860 #endif
861
862 static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
863 {
864         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
865
866         return vmcs_readl(sf->base);
867 }
868
869 static void vmx_get_segment(struct kvm_vcpu *vcpu,
870                             struct kvm_segment *var, int seg)
871 {
872         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
873         u32 ar;
874
875         var->base = vmcs_readl(sf->base);
876         var->limit = vmcs_read32(sf->limit);
877         var->selector = vmcs_read16(sf->selector);
878         ar = vmcs_read32(sf->ar_bytes);
879         if (ar & AR_UNUSABLE_MASK)
880                 ar = 0;
881         var->type = ar & 15;
882         var->s = (ar >> 4) & 1;
883         var->dpl = (ar >> 5) & 3;
884         var->present = (ar >> 7) & 1;
885         var->avl = (ar >> 12) & 1;
886         var->l = (ar >> 13) & 1;
887         var->db = (ar >> 14) & 1;
888         var->g = (ar >> 15) & 1;
889         var->unusable = (ar >> 16) & 1;
890 }
891
892 static void vmx_set_segment(struct kvm_vcpu *vcpu,
893                             struct kvm_segment *var, int seg)
894 {
895         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
896         u32 ar;
897
898         vmcs_writel(sf->base, var->base);
899         vmcs_write32(sf->limit, var->limit);
900         vmcs_write16(sf->selector, var->selector);
901         if (vcpu->rmode.active && var->s) {
902                 /*
903                  * Hack real-mode segments into vm86 compatibility.
904                  */
905                 if (var->base == 0xffff0000 && var->selector == 0xf000)
906                         vmcs_writel(sf->base, 0xf0000);
907                 ar = 0xf3;
908         } else if (var->unusable)
909                 ar = 1 << 16;
910         else {
911                 ar = var->type & 15;
912                 ar |= (var->s & 1) << 4;
913                 ar |= (var->dpl & 3) << 5;
914                 ar |= (var->present & 1) << 7;
915                 ar |= (var->avl & 1) << 12;
916                 ar |= (var->l & 1) << 13;
917                 ar |= (var->db & 1) << 14;
918                 ar |= (var->g & 1) << 15;
919         }
920         if (ar == 0) /* a 0 value means unusable */
921                 ar = AR_UNUSABLE_MASK;
922         vmcs_write32(sf->ar_bytes, ar);
923 }
924
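/*
 * Sketch, not part of the driver: the access-rights word that
 * vmx_get_segment()/vmx_set_segment() above shuffle packs type[3:0], S[4],
 * DPL[6:5], P[7], AVL[12], L[13], D/B[14], G[15] and the VMX-specific
 * "unusable" flag in bit 16.  example_pack_ar() is an illustrative mirror
 * of the set path, not a replacement for it.
 */
static inline u32 example_pack_ar(struct kvm_segment *var)
{
        return (var->type & 15)
                | ((var->s & 1) << 4)
                | ((var->dpl & 3) << 5)
                | ((var->present & 1) << 7)
                | ((var->avl & 1) << 12)
                | ((var->l & 1) << 13)
                | ((var->db & 1) << 14)
                | ((var->g & 1) << 15)
                | ((var->unusable & 1) << 16);
}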
925 static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
926 {
927         u32 ar = vmcs_read32(GUEST_CS_AR_BYTES);
928
929         *db = (ar >> 14) & 1;
930         *l = (ar >> 13) & 1;
931 }
932
933 static void vmx_get_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
934 {
935         dt->limit = vmcs_read32(GUEST_IDTR_LIMIT);
936         dt->base = vmcs_readl(GUEST_IDTR_BASE);
937 }
938
939 static void vmx_set_idt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
940 {
941         vmcs_write32(GUEST_IDTR_LIMIT, dt->limit);
942         vmcs_writel(GUEST_IDTR_BASE, dt->base);
943 }
944
945 static void vmx_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
946 {
947         dt->limit = vmcs_read32(GUEST_GDTR_LIMIT);
948         dt->base = vmcs_readl(GUEST_GDTR_BASE);
949 }
950
951 static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
952 {
953         vmcs_write32(GUEST_GDTR_LIMIT, dt->limit);
954         vmcs_writel(GUEST_GDTR_BASE, dt->base);
955 }
956
957 static int init_rmode_tss(struct kvm* kvm)
958 {
959         struct page *p1, *p2, *p3;
960         gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
961         char *page;
962
963         p1 = gfn_to_page(kvm, fn++);
964         p2 = gfn_to_page(kvm, fn++);
965         p3 = gfn_to_page(kvm, fn);
966
967         if (!p1 || !p2 || !p3) {
968                 kvm_printf(kvm,"%s: gfn_to_page failed\n", __FUNCTION__);
969                 return 0;
970         }
971
972         page = kmap_atomic(p1, KM_USER0);
973         memset(page, 0, PAGE_SIZE);
974         *(u16*)(page + 0x66) = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
975         kunmap_atomic(page, KM_USER0);
976
977         page = kmap_atomic(p2, KM_USER0);
978         memset(page, 0, PAGE_SIZE);
979         kunmap_atomic(page, KM_USER0);
980
981         page = kmap_atomic(p3, KM_USER0);
982         memset(page, 0, PAGE_SIZE);
983         *(page + RMODE_TSS_SIZE - 2 * PAGE_SIZE - 1) = ~0;
984         kunmap_atomic(page, KM_USER0);
985
986         return 1;
987 }
988
989 static void vmcs_write32_fixedbits(u32 msr, u32 vmcs_field, u32 val)
990 {
991         u32 msr_high, msr_low;
992
993         rdmsr(msr, msr_low, msr_high);
994
995         val &= msr_high;
996         val |= msr_low;
997         vmcs_write32(vmcs_field, val);
998 }
999
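/*
 * Sketch, assuming the VMX capability MSR convention: the low 32 bits of
 * an MSR_IA32_VMX_*_CTLS MSR give the control bits that must be 1 and the
 * high 32 bits the bits that may be 1, which is exactly the
 * 'val &= msr_high; val |= msr_low' adjustment made above.  example_*
 * name is illustrative only.
 */
static inline u32 example_adjust_ctls(u32 wanted, u32 must_be_one, u32 allowed_one)
{
        return (wanted & allowed_one) | must_be_one;
}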
1000 static void seg_setup(int seg)
1001 {
1002         struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1003
1004         vmcs_write16(sf->selector, 0);
1005         vmcs_writel(sf->base, 0);
1006         vmcs_write32(sf->limit, 0xffff);
1007         vmcs_write32(sf->ar_bytes, 0x93);
1008 }
1009
1010 /*
1011  * Sets up the vmcs for emulated real mode.
1012  */
1013 static int vmx_vcpu_setup(struct kvm_vcpu *vcpu)
1014 {
1015         u32 host_sysenter_cs;
1016         u32 junk;
1017         unsigned long a;
1018         struct descriptor_table dt;
1019         int i;
1020         int ret = 0;
1021         extern asmlinkage void kvm_vmx_return(void);
1022
1023         if (!init_rmode_tss(vcpu->kvm)) {
1024                 ret = -ENOMEM;
1025                 goto out;
1026         }
1027
1028         memset(vcpu->regs, 0, sizeof(vcpu->regs));
1029         vcpu->regs[VCPU_REGS_RDX] = get_rdx_init_val();
1030         vcpu->cr8 = 0;
1031         vcpu->apic_base = 0xfee00000 |
1032                         /*for vcpu 0*/ MSR_IA32_APICBASE_BSP |
1033                         MSR_IA32_APICBASE_ENABLE;
1034
1035         fx_init(vcpu);
1036
1037         /*
1038          * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
1039          * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4.  Sigh.
1040          */
1041         vmcs_write16(GUEST_CS_SELECTOR, 0xf000);
1042         vmcs_writel(GUEST_CS_BASE, 0x000f0000);
1043         vmcs_write32(GUEST_CS_LIMIT, 0xffff);
1044         vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
1045
1046         seg_setup(VCPU_SREG_DS);
1047         seg_setup(VCPU_SREG_ES);
1048         seg_setup(VCPU_SREG_FS);
1049         seg_setup(VCPU_SREG_GS);
1050         seg_setup(VCPU_SREG_SS);
1051
1052         vmcs_write16(GUEST_TR_SELECTOR, 0);
1053         vmcs_writel(GUEST_TR_BASE, 0);
1054         vmcs_write32(GUEST_TR_LIMIT, 0xffff);
1055         vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1056
1057         vmcs_write16(GUEST_LDTR_SELECTOR, 0);
1058         vmcs_writel(GUEST_LDTR_BASE, 0);
1059         vmcs_write32(GUEST_LDTR_LIMIT, 0xffff);
1060         vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082);
1061
1062         vmcs_write32(GUEST_SYSENTER_CS, 0);
1063         vmcs_writel(GUEST_SYSENTER_ESP, 0);
1064         vmcs_writel(GUEST_SYSENTER_EIP, 0);
1065
1066         vmcs_writel(GUEST_RFLAGS, 0x02);
1067         vmcs_writel(GUEST_RIP, 0xfff0);
1068         vmcs_writel(GUEST_RSP, 0);
1069
1070         //todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0
1071         vmcs_writel(GUEST_DR7, 0x400);
1072
1073         vmcs_writel(GUEST_GDTR_BASE, 0);
1074         vmcs_write32(GUEST_GDTR_LIMIT, 0xffff);
1075
1076         vmcs_writel(GUEST_IDTR_BASE, 0);
1077         vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
1078
1079         vmcs_write32(GUEST_ACTIVITY_STATE, 0);
1080         vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
1081         vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
1082
1083         /* I/O */
1084         vmcs_write64(IO_BITMAP_A, 0);
1085         vmcs_write64(IO_BITMAP_B, 0);
1086
1087         guest_write_tsc(0);
1088
1089         vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
1090
1091         /* Special registers */
1092         vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
1093
1094         /* Control */
1095         vmcs_write32_fixedbits(MSR_IA32_VMX_PINBASED_CTLS,
1096                                PIN_BASED_VM_EXEC_CONTROL,
1097                                PIN_BASED_EXT_INTR_MASK   /* 20.6.1 */
1098                                | PIN_BASED_NMI_EXITING   /* 20.6.1 */
1099                         );
1100         vmcs_write32_fixedbits(MSR_IA32_VMX_PROCBASED_CTLS,
1101                                CPU_BASED_VM_EXEC_CONTROL,
1102                                CPU_BASED_HLT_EXITING         /* 20.6.2 */
1103                                | CPU_BASED_CR8_LOAD_EXITING    /* 20.6.2 */
1104                                | CPU_BASED_CR8_STORE_EXITING   /* 20.6.2 */
1105                                | CPU_BASED_UNCOND_IO_EXITING   /* 20.6.2 */
1106                                | CPU_BASED_MOV_DR_EXITING
1107                                | CPU_BASED_USE_TSC_OFFSETING   /* 21.3 */
1108                         );
1109
1110         vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR);
1111         vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
1112         vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1113         vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
1114
1115         vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
1116         vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
1117         vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
1118
1119         vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
1120         vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1121         vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1122         vmcs_write16(HOST_FS_SELECTOR, read_fs());    /* 22.2.4 */
1123         vmcs_write16(HOST_GS_SELECTOR, read_gs());    /* 22.2.4 */
1124         vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS);  /* 22.2.4 */
1125 #ifdef CONFIG_X86_64
1126         rdmsrl(MSR_FS_BASE, a);
1127         vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
1128         rdmsrl(MSR_GS_BASE, a);
1129         vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
1130 #else
1131         vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
1132         vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
1133 #endif
1134
1135         vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8);  /* 22.2.4 */
1136
1137         get_idt(&dt);
1138         vmcs_writel(HOST_IDTR_BASE, dt.base);   /* 22.2.4 */
1139
1140
1141         vmcs_writel(HOST_RIP, (unsigned long)kvm_vmx_return); /* 22.2.5 */
1142
1143         rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk);
1144         vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs);
1145         rdmsrl(MSR_IA32_SYSENTER_ESP, a);
1146         vmcs_writel(HOST_IA32_SYSENTER_ESP, a);   /* 22.2.3 */
1147         rdmsrl(MSR_IA32_SYSENTER_EIP, a);
1148         vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
1149
1150         for (i = 0; i < NR_VMX_MSR; ++i) {
1151                 u32 index = vmx_msr_index[i];
1152                 u32 data_low, data_high;
1153                 u64 data;
1154                 int j = vcpu->nmsrs;
1155
1156                 if (rdmsr_safe(index, &data_low, &data_high) < 0)
1157                         continue;
1158                 if (wrmsr_safe(index, data_low, data_high) < 0)
1159                         continue;
1160                 data = data_low | ((u64)data_high << 32);
1161                 vcpu->host_msrs[j].index = index;
1162                 vcpu->host_msrs[j].reserved = 0;
1163                 vcpu->host_msrs[j].data = data;
1164                 vcpu->guest_msrs[j] = vcpu->host_msrs[j];
1165 #ifdef CONFIG_X86_64
1166                 if (index == MSR_KERNEL_GS_BASE)
1167                         msr_offset_kernel_gs_base = j;
1168 #endif
1169                 ++vcpu->nmsrs;
1170         }
1171
1172         setup_msrs(vcpu);
1173
1174         vmcs_write32_fixedbits(MSR_IA32_VMX_EXIT_CTLS, VM_EXIT_CONTROLS,
1175                                (HOST_IS_64 << 9));  /* 22.2.1, 20.7.1 */
1176
1177         /* 22.2.1, 20.8.1 */
1178         vmcs_write32_fixedbits(MSR_IA32_VMX_ENTRY_CTLS,
1179                                VM_ENTRY_CONTROLS, 0);
1180         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);  /* 22.2.1 */
1181
1182 #ifdef CONFIG_X86_64
1183         vmcs_writel(VIRTUAL_APIC_PAGE_ADDR, 0);
1184         vmcs_writel(TPR_THRESHOLD, 0);
1185 #endif
1186
1187         vmcs_writel(CR0_GUEST_HOST_MASK, KVM_GUEST_CR0_MASK);
1188         vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK);
1189
1190         vcpu->cr0 = 0x60000010;
1191         vmx_set_cr0(vcpu, vcpu->cr0); // enter rmode
1192         vmx_set_cr4(vcpu, 0);
1193 #ifdef CONFIG_X86_64
1194         vmx_set_efer(vcpu, 0);
1195 #endif
1196
1197         return 0;
1198
1199 out:
1200         return ret;
1201 }
1202
1203 static void inject_rmode_irq(struct kvm_vcpu *vcpu, int irq)
1204 {
1205         u16 ent[2];
1206         u16 cs;
1207         u16 ip;
1208         unsigned long flags;
1209         unsigned long ss_base = vmcs_readl(GUEST_SS_BASE);
1210         u16 sp =  vmcs_readl(GUEST_RSP);
1211         u32 ss_limit = vmcs_read32(GUEST_SS_LIMIT);
1212
1213         if (sp > ss_limit || sp < 6 ) {
1214                 vcpu_printf(vcpu, "%s: #SS, rsp 0x%lx ss 0x%lx limit 0x%x\n",
1215                             __FUNCTION__,
1216                             vmcs_readl(GUEST_RSP),
1217                             vmcs_readl(GUEST_SS_BASE),
1218                             vmcs_read32(GUEST_SS_LIMIT));
1219                 return;
1220         }
1221
1222         if (kvm_read_guest(vcpu, irq * sizeof(ent), sizeof(ent), &ent) !=
1223                                                                 sizeof(ent)) {
1224                 vcpu_printf(vcpu, "%s: read guest err\n", __FUNCTION__);
1225                 return;
1226         }
1227
1228         flags =  vmcs_readl(GUEST_RFLAGS);
1229         cs =  vmcs_readl(GUEST_CS_BASE) >> 4;
1230         ip =  vmcs_readl(GUEST_RIP);
1231
1232
1233         if (kvm_write_guest(vcpu, ss_base + sp - 2, 2, &flags) != 2 ||
1234             kvm_write_guest(vcpu, ss_base + sp - 4, 2, &cs) != 2 ||
1235             kvm_write_guest(vcpu, ss_base + sp - 6, 2, &ip) != 2) {
1236                 vcpu_printf(vcpu, "%s: write guest err\n", __FUNCTION__);
1237                 return;
1238         }
1239
1240         vmcs_writel(GUEST_RFLAGS, flags &
1241                     ~( X86_EFLAGS_IF | X86_EFLAGS_AC | X86_EFLAGS_TF));
1242         vmcs_write16(GUEST_CS_SELECTOR, ent[1]) ;
1243         vmcs_writel(GUEST_CS_BASE, ent[1] << 4);
1244         vmcs_writel(GUEST_RIP, ent[0]);
1245         vmcs_writel(GUEST_RSP, (vmcs_readl(GUEST_RSP) & ~0xffff) | (sp - 6));
1246 }
1247
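/*
 * Sketch, not part of the driver: inject_rmode_irq() above follows the
 * real-mode interrupt protocol -- IVT entry 'irq' lives at physical
 * address irq * 4 as an offset:segment pair (ent[0] = ip, ent[1] = cs),
 * and FLAGS, CS and IP are pushed as 16-bit words before sp drops by six.
 * example_* names are illustrative only.
 */
static inline unsigned long example_ivt_entry_addr(int irq)
{
        return irq * 4;         /* 4-byte IVT entries starting at linear 0 */
}

static inline u16 example_rmode_sp_after_irq(u16 sp)
{
        return sp - 6;          /* room for flags, cs, ip */
}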
1248 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
1249 {
1250         int word_index = __ffs(vcpu->irq_summary);
1251         int bit_index = __ffs(vcpu->irq_pending[word_index]);
1252         int irq = word_index * BITS_PER_LONG + bit_index;
1253
1254         clear_bit(bit_index, &vcpu->irq_pending[word_index]);
1255         if (!vcpu->irq_pending[word_index])
1256                 clear_bit(word_index, &vcpu->irq_summary);
1257
1258         if (vcpu->rmode.active) {
1259                 inject_rmode_irq(vcpu, irq);
1260                 return;
1261         }
1262         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
1263                         irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
1264 }
1265
1266
1267 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
1268                                        struct kvm_run *kvm_run)
1269 {
1270         u32 cpu_based_vm_exec_control;
1271
1272         vcpu->interrupt_window_open =
1273                 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
1274                  (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
1275
1276         if (vcpu->interrupt_window_open &&
1277             vcpu->irq_summary &&
1278             !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
1279                 /*
1280                  * If interrupts enabled, and not blocked by sti or mov ss. Good.
1281                  */
1282                 kvm_do_inject_irq(vcpu);
1283
1284         cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
1285         if (!vcpu->interrupt_window_open &&
1286             (vcpu->irq_summary || kvm_run->request_interrupt_window))
1287                 /*
1288                  * Interrupts blocked.  Wait for unblock.
1289                  */
1290                 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
1291         else
1292                 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
1293         vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
1294 }
1295
1296 static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
1297 {
1298         struct kvm_guest_debug *dbg = &vcpu->guest_debug;
1299
1300         set_debugreg(dbg->bp[0], 0);
1301         set_debugreg(dbg->bp[1], 1);
1302         set_debugreg(dbg->bp[2], 2);
1303         set_debugreg(dbg->bp[3], 3);
1304
1305         if (dbg->singlestep) {
1306                 unsigned long flags;
1307
1308                 flags = vmcs_readl(GUEST_RFLAGS);
1309                 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1310                 vmcs_writel(GUEST_RFLAGS, flags);
1311         }
1312 }
1313
1314 static int handle_rmode_exception(struct kvm_vcpu *vcpu,
1315                                   int vec, u32 err_code)
1316 {
1317         if (!vcpu->rmode.active)
1318                 return 0;
1319
1320         if (vec == GP_VECTOR && err_code == 0)
1321                 if (emulate_instruction(vcpu, NULL, 0, 0) == EMULATE_DONE)
1322                         return 1;
1323         return 0;
1324 }
1325
1326 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1327 {
1328         u32 intr_info, error_code;
1329         unsigned long cr2, rip;
1330         u32 vect_info;
1331         enum emulation_result er;
1332         int r;
1333
1334         vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1335         intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
1336
1337         if ((vect_info & VECTORING_INFO_VALID_MASK) &&
1338                                                 !is_page_fault(intr_info)) {
1339                 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
1340                        "intr info 0x%x\n", __FUNCTION__, vect_info, intr_info);
1341         }
1342
1343         if (is_external_interrupt(vect_info)) {
1344                 int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
1345                 set_bit(irq, vcpu->irq_pending);
1346                 set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
1347         }
1348
1349         if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
1350                 asm ("int $2");
1351                 return 1;
1352         }
1353         error_code = 0;
1354         rip = vmcs_readl(GUEST_RIP);
1355         if (intr_info & INTR_INFO_DELIEVER_CODE_MASK)
1356                 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
1357         if (is_page_fault(intr_info)) {
1358                 cr2 = vmcs_readl(EXIT_QUALIFICATION);
1359
1360                 spin_lock(&vcpu->kvm->lock);
1361                 r = kvm_mmu_page_fault(vcpu, cr2, error_code);
1362                 if (r < 0) {
1363                         spin_unlock(&vcpu->kvm->lock);
1364                         return r;
1365                 }
1366                 if (!r) {
1367                         spin_unlock(&vcpu->kvm->lock);
1368                         return 1;
1369                 }
1370
1371                 er = emulate_instruction(vcpu, kvm_run, cr2, error_code);
1372                 spin_unlock(&vcpu->kvm->lock);
1373
1374                 switch (er) {
1375                 case EMULATE_DONE:
1376                         return 1;
1377                 case EMULATE_DO_MMIO:
1378                         ++kvm_stat.mmio_exits;
1379                         kvm_run->exit_reason = KVM_EXIT_MMIO;
1380                         return 0;
1381                  case EMULATE_FAIL:
1382                         vcpu_printf(vcpu, "%s: emulate fail\n", __FUNCTION__);
1383                         break;
1384                 default:
1385                         BUG();
1386                 }
1387         }
1388
1389         if (vcpu->rmode.active &&
1390             handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
1391                                                                 error_code))
1392                 return 1;
1393
1394         if ((intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK)) == (INTR_TYPE_EXCEPTION | 1)) {
1395                 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1396                 return 0;
1397         }
1398         kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
1399         kvm_run->ex.exception = intr_info & INTR_INFO_VECTOR_MASK;
1400         kvm_run->ex.error_code = error_code;
1401         return 0;
1402 }
1403
1404 static int handle_external_interrupt(struct kvm_vcpu *vcpu,
1405                                      struct kvm_run *kvm_run)
1406 {
1407         ++kvm_stat.irq_exits;
1408         return 1;
1409 }
1410
1411 static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1412 {
1413         kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
1414         return 0;
1415 }
1416
1417 static int get_io_count(struct kvm_vcpu *vcpu, unsigned long *count)
1418 {
1419         u64 inst;
1420         gva_t rip;
1421         int countr_size;
1422         int i, n;
1423
1424         if ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_VM)) {
1425                 countr_size = 2;
1426         } else {
1427                 u32 cs_ar = vmcs_read32(GUEST_CS_AR_BYTES);
1428
1429                 countr_size = (cs_ar & AR_L_MASK) ? 8:
1430                               (cs_ar & AR_DB_MASK) ? 4: 2;
1431         }
1432
1433         rip =  vmcs_readl(GUEST_RIP);
1434         if (countr_size != 8)
1435                 rip += vmcs_readl(GUEST_CS_BASE);
1436
1437         n = kvm_read_guest(vcpu, rip, sizeof(inst), &inst);
1438
1439         for (i = 0; i < n; i++) {
1440                 switch (((u8*)&inst)[i]) {
1441                 case 0xf0:
1442                 case 0xf2:
1443                 case 0xf3:
1444                 case 0x2e:
1445                 case 0x36:
1446                 case 0x3e:
1447                 case 0x26:
1448                 case 0x64:
1449                 case 0x65:
1450                 case 0x66:
1451                         break;
1452                 case 0x67:
1453                         countr_size = (countr_size == 2) ? 4: (countr_size >> 1); /* fall through */
1454                 default:
1455                         goto done;
1456                 }
1457         }
1458         return 0;
1459 done:
1460         countr_size *= 8;
1461         *count = vcpu->regs[VCPU_REGS_RCX] & (~0ULL >> (64 - countr_size));
1462         //printk("cx: %lx\n", vcpu->regs[VCPU_REGS_RCX]);
1463         return 1;
1464 }
1465
1466 static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1467 {
1468         u64 exit_qualification;
1469         int size, down, in, string, rep;
1470         unsigned port;
1471         unsigned long count;
1472         gva_t address;
1473
1474         ++kvm_stat.io_exits;
1475         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1476         in = (exit_qualification & 8) != 0;
1477         size = (exit_qualification & 7) + 1;
1478         string = (exit_qualification & 16) != 0;
1479         down = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_DF) != 0;
1480         count = 1;
1481         rep = (exit_qualification & 32) != 0;
1482         port = exit_qualification >> 16;
1483         address = 0;
1484         if (string) {
1485                 if (rep && !get_io_count(vcpu, &count))
1486                         return 1;
1487                 address = vmcs_readl(GUEST_LINEAR_ADDRESS);
1488         }
1489         return kvm_setup_pio(vcpu, kvm_run, in, size, count, string, down,
1490                              address, rep, port);
1491 }
1492
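/*
 * Sketch, not part of the driver: the bit tests in handle_io() above
 * decode the I/O-instruction exit qualification -- bits 2:0 hold size-1,
 * bit 3 the direction (1 = IN), bit 4 the string flag, bit 5 the REP
 * prefix and bits 31:16 the port number.  example_* names are
 * illustrative only.
 */
struct example_io_exit {
        int size, in, string, rep;
        unsigned port;
};

static inline void example_decode_io(u64 q, struct example_io_exit *io)
{
        io->size   = (q & 7) + 1;
        io->in     = (q & 8) != 0;
        io->string = (q & 16) != 0;
        io->rep    = (q & 32) != 0;
        io->port   = q >> 16;
}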
1493 static void
1494 vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
1495 {
1496         /*
1497          * Patch in the VMCALL instruction:
1498          */
1499         hypercall[0] = 0x0f;
1500         hypercall[1] = 0x01;
1501         hypercall[2] = 0xc1;
1502         hypercall[3] = 0xc3;
1503 }
1504
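/*
 * Sketch, not part of the driver: the bytes patched in above are the
 * x86 encoding of VMCALL followed by a near return, so the guest's
 * hypercall stub ends up as "vmcall; ret".  example_* name is
 * illustrative only.
 */
static const u8 example_vmcall_stub[] = {
        0x0f, 0x01, 0xc1,       /* vmcall */
        0xc3,                   /* ret    */
};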
1505 static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1506 {
1507         u64 exit_qualification;
1508         int cr;
1509         int reg;
1510
1511         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1512         cr = exit_qualification & 15;
1513         reg = (exit_qualification >> 8) & 15;
1514         switch ((exit_qualification >> 4) & 3) {
1515         case 0: /* mov to cr */
1516                 switch (cr) {
1517                 case 0:
1518                         vcpu_load_rsp_rip(vcpu);
1519                         set_cr0(vcpu, vcpu->regs[reg]);
1520                         skip_emulated_instruction(vcpu);
1521                         return 1;
1522                 case 3:
1523                         vcpu_load_rsp_rip(vcpu);
1524                         set_cr3(vcpu, vcpu->regs[reg]);
1525                         skip_emulated_instruction(vcpu);
1526                         return 1;
1527                 case 4:
1528                         vcpu_load_rsp_rip(vcpu);
1529                         set_cr4(vcpu, vcpu->regs[reg]);
1530                         skip_emulated_instruction(vcpu);
1531                         return 1;
1532                 case 8:
1533                         vcpu_load_rsp_rip(vcpu);
1534                         set_cr8(vcpu, vcpu->regs[reg]);
1535                         skip_emulated_instruction(vcpu);
1536                         return 1;
1537                 };
1538                 break;
1539         case 1: /*mov from cr*/
1540                 switch (cr) {
1541                 case 3:
1542                         vcpu_load_rsp_rip(vcpu);
1543                         vcpu->regs[reg] = vcpu->cr3;
1544                         vcpu_put_rsp_rip(vcpu);
1545                         skip_emulated_instruction(vcpu);
1546                         return 1;
1547                 case 8:
1548                         printk(KERN_DEBUG "handle_cr: read CR8 "
1549                                "cpu erratum AA15\n");
1550                         vcpu_load_rsp_rip(vcpu);
1551                         vcpu->regs[reg] = vcpu->cr8;
1552                         vcpu_put_rsp_rip(vcpu);
1553                         skip_emulated_instruction(vcpu);
1554                         return 1;
1555                 }
1556                 break;
1557         case 3: /* lmsw */
1558                 lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f);
1559
1560                 skip_emulated_instruction(vcpu);
1561                 return 1;
1562         default:
1563                 break;
1564         }
1565         kvm_run->exit_reason = 0;
1566         printk(KERN_ERR "kvm: unhandled control register: op %d cr %d\n",
1567                (int)(exit_qualification >> 4) & 3, cr);
1568         return 0;
1569 }
1570
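/*
 * Sketch, not part of the driver: the field carving in handle_cr() above
 * follows the control-register-access exit qualification -- bits 3:0 name
 * the CR, bits 5:4 the access type (0 = mov to cr, 1 = mov from cr,
 * 2 = clts, 3 = lmsw), bits 11:8 the general-purpose register and
 * bits 31:16 the lmsw source operand.  example_* names are illustrative.
 */
static inline int example_cr_number(u64 q)
{
        return q & 15;
}

static inline int example_cr_access_type(u64 q)
{
        return (q >> 4) & 3;
}

static inline int example_cr_gp_reg(u64 q)
{
        return (q >> 8) & 15;
}

static inline u16 example_lmsw_source(u64 q)
{
        return (q >> 16) & 0xffff;
}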
1571 static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1572 {
1573         u64 exit_qualification;
1574         unsigned long val;
1575         int dr, reg;
1576
1577         /*
1578          * FIXME: this code assumes the host is debugging the guest.
1579          *        need to deal with guest debugging itself too.
1580          */
1581         exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
1582         dr = exit_qualification & 7;
1583         reg = (exit_qualification >> 8) & 15;
1584         vcpu_load_rsp_rip(vcpu);
1585         if (exit_qualification & 16) {
1586                 /* mov from dr */
1587                 switch (dr) {
1588                 case 6:
1589                         val = 0xffff0ff0;
1590                         break;
1591                 case 7:
1592                         val = 0x400;
1593                         break;
1594                 default:
1595                         val = 0;
1596                 }
1597                 vcpu->regs[reg] = val;
1598         } else {
1599                 /* mov to dr */
1600         }
1601         vcpu_put_rsp_rip(vcpu);
1602         skip_emulated_instruction(vcpu);
1603         return 1;
1604 }
1605
1606 static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1607 {
1608         kvm_emulate_cpuid(vcpu);
1609         return 1;
1610 }
1611
1612 static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1613 {
1614         u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1615         u64 data;
1616
1617         if (vmx_get_msr(vcpu, ecx, &data)) {
1618                 vmx_inject_gp(vcpu, 0);
1619                 return 1;
1620         }
1621
1622         /* FIXME: handling of bits 32:63 of rax, rdx */
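        /*
         * RDMSR returns the 64-bit MSR value split across edx:eax; for
         * example, data == 0x0123456789abcdef yields eax = 0x89abcdef
         * and edx = 0x01234567.
         */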
1623         vcpu->regs[VCPU_REGS_RAX] = data & -1u;
1624         vcpu->regs[VCPU_REGS_RDX] = (data >> 32) & -1u;
1625         skip_emulated_instruction(vcpu);
1626         return 1;
1627 }
1628
1629 static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1630 {
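        /* WRMSR supplies the 64-bit value in edx:eax; reassemble it. */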
1631         u32 ecx = vcpu->regs[VCPU_REGS_RCX];
1632         u64 data = (vcpu->regs[VCPU_REGS_RAX] & -1u)
1633                 | ((u64)(vcpu->regs[VCPU_REGS_RDX] & -1u) << 32);
1634
1635         if (vmx_set_msr(vcpu, ecx, data) != 0) {
1636                 vmx_inject_gp(vcpu, 0);
1637                 return 1;
1638         }
1639
1640         skip_emulated_instruction(vcpu);
1641         return 1;
1642 }
1643
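/*
 * Mirror the bits of vcpu state that userspace needs after an exit: the
 * guest interrupt flag, cr8, the apic base, and whether an interrupt can
 * be injected right now.
 */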
1644 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
1645                               struct kvm_run *kvm_run)
1646 {
1647         kvm_run->if_flag = (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) != 0;
1648         kvm_run->cr8 = vcpu->cr8;
1649         kvm_run->apic_base = vcpu->apic_base;
1650         kvm_run->ready_for_interrupt_injection = (vcpu->interrupt_window_open &&
1651                                                   vcpu->irq_summary == 0);
1652 }
1653
1654 static int handle_interrupt_window(struct kvm_vcpu *vcpu,
1655                                    struct kvm_run *kvm_run)
1656 {
1657         /*
1658          * If userspace is waiting to inject interrupts, exit as soon
1659          * as possible.
1660          */
1661         if (kvm_run->request_interrupt_window &&
1662             !vcpu->irq_summary) {
1663                 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
1664                 ++kvm_stat.irq_window_exits;
1665                 return 0;
1666         }
1667         return 1;
1668 }
1669
1670 static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1671 {
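        /*
         * If an interrupt is already pending, the hlt is effectively a
         * no-op and the guest can keep running; otherwise exit so that
         * userspace can wait for an interrupt to deliver.
         */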
1672         skip_emulated_instruction(vcpu);
1673         if (vcpu->irq_summary)
1674                 return 1;
1675
1676         kvm_run->exit_reason = KVM_EXIT_HLT;
1677         ++kvm_stat.halt_exits;
1678         return 0;
1679 }
1680
1681 static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1682 {
1683         skip_emulated_instruction(vcpu);
1684         return kvm_hypercall(vcpu, kvm_run);
1685 }
1686
1687 /*
1688  * The exit handlers return 1 if the exit was handled fully and guest execution
1689  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
1690  * to be done to userspace and return 0.
1691  */
1692 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
1693                                       struct kvm_run *kvm_run) = {
1694         [EXIT_REASON_EXCEPTION_NMI]           = handle_exception,
1695         [EXIT_REASON_EXTERNAL_INTERRUPT]      = handle_external_interrupt,
1696         [EXIT_REASON_TRIPLE_FAULT]            = handle_triple_fault,
1697         [EXIT_REASON_IO_INSTRUCTION]          = handle_io,
1698         [EXIT_REASON_CR_ACCESS]               = handle_cr,
1699         [EXIT_REASON_DR_ACCESS]               = handle_dr,
1700         [EXIT_REASON_CPUID]                   = handle_cpuid,
1701         [EXIT_REASON_MSR_READ]                = handle_rdmsr,
1702         [EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
1703         [EXIT_REASON_PENDING_INTERRUPT]       = handle_interrupt_window,
1704         [EXIT_REASON_HLT]                     = handle_halt,
1705         [EXIT_REASON_VMCALL]                  = handle_vmcall,
1706 };
1707
1708 static const int kvm_vmx_max_exit_handlers =
1709         ARRAY_SIZE(kvm_vmx_exit_handlers);
1710
1711 /*
1712  * The guest has exited.  See if we can fix it or if we need userspace
1713  * assistance.
1714  */
1715 static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1716 {
1717         u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
1718         u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
1719
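        /*
         * A valid IDT-vectoring info field means an event was being
         * delivered when the exit happened.  The only exit where that is
         * expected (and handled) is the exception/NMI exit, so warn for
         * anything else.
         */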
1720         if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
1721             exit_reason != EXIT_REASON_EXCEPTION_NMI)
1722                 printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
1723                        "exit reason is 0x%x\n", __FUNCTION__, exit_reason);
1724         kvm_run->instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
1725         if (exit_reason < kvm_vmx_max_exit_handlers
1726             && kvm_vmx_exit_handlers[exit_reason])
1727                 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
1728         else {
1729                 kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
1730                 kvm_run->hw.hardware_exit_reason = exit_reason;
1731         }
1732         return 0;
1733 }
1734
1735 /*
1736  * Check if userspace requested an interrupt window, and that the
1737  * interrupt window is open.
1738  *
1739  * No need to exit to userspace if we already have an interrupt queued.
1740  */
1741 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
1742                                           struct kvm_run *kvm_run)
1743 {
1744         return (!vcpu->irq_summary &&
1745                 kvm_run->request_interrupt_window &&
1746                 vcpu->interrupt_window_open &&
1747                 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
1748 }
1749
1750 static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1751 {
1752         u8 fail;
1753         u16 fs_sel, gs_sel, ldt_sel;
1754         int fs_gs_ldt_reload_needed;
1755         int r;
1756
1757 again:
1758         /*
1759          * Set host fs and gs selectors.  Unfortunately, SDM section 22.2.3
1760          * does not allow segment selectors with cpl > 0 or ti == 1.
1761          */
1762         fs_sel = read_fs();
1763         gs_sel = read_gs();
1764         ldt_sel = read_ldt();
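        /* Selector & 7 covers the RPL field (bits 1:0) and the TI bit (bit 2). */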
1765         fs_gs_ldt_reload_needed = (fs_sel & 7) | (gs_sel & 7) | ldt_sel;
1766         if (!fs_gs_ldt_reload_needed) {
1767                 vmcs_write16(HOST_FS_SELECTOR, fs_sel);
1768                 vmcs_write16(HOST_GS_SELECTOR, gs_sel);
1769         } else {
1770                 vmcs_write16(HOST_FS_SELECTOR, 0);
1771                 vmcs_write16(HOST_GS_SELECTOR, 0);
1772         }
1773
1774 #ifdef CONFIG_X86_64
1775         vmcs_writel(HOST_FS_BASE, read_msr(MSR_FS_BASE));
1776         vmcs_writel(HOST_GS_BASE, read_msr(MSR_GS_BASE));
1777 #else
1778         vmcs_writel(HOST_FS_BASE, segment_base(fs_sel));
1779         vmcs_writel(HOST_GS_BASE, segment_base(gs_sel));
1780 #endif
1781
1782         if (!vcpu->mmio_read_completed)
1783                 do_interrupt_requests(vcpu, kvm_run);
1784
1785         if (vcpu->guest_debug.enabled)
1786                 kvm_guest_debug_pre(vcpu);
1787
1788         fx_save(vcpu->host_fx_image);
1789         fx_restore(vcpu->guest_fx_image);
1790
1791 #ifdef CONFIG_X86_64
1792         if (is_long_mode(vcpu)) {
1793                 save_msrs(vcpu->host_msrs + msr_offset_kernel_gs_base, 1);
1794                 load_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1795         }
1796 #endif
1797
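        /*
         * Enter the guest.  The very first entry on this VMCS must use
         * VMLAUNCH; later entries use VMRESUME (selected below through
         * vcpu->launched).  Host general-purpose registers are saved and
         * restored by hand around the entry, since VMX only switches the
         * state kept in the VMCS.
         */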
1798         asm (
1799                 /* Store host registers */
1800                 "pushf \n\t"
1801 #ifdef CONFIG_X86_64
1802                 "push %%rax; push %%rbx; push %%rdx;"
1803                 "push %%rsi; push %%rdi; push %%rbp;"
1804                 "push %%r8;  push %%r9;  push %%r10; push %%r11;"
1805                 "push %%r12; push %%r13; push %%r14; push %%r15;"
1806                 "push %%rcx \n\t"
1807                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1808 #else
1809                 "pusha; push %%ecx \n\t"
1810                 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
1811 #endif
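                /*
                 * HOST_RSP was just rewritten (vmwrite above) so that the
                 * exit path lands with the stack pointer aimed at the host
                 * registers pushed here.
                 */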
1812                 /* Check if vmlaunch or vmresume is needed */
1813                 "cmp $0, %1 \n\t"
1814                 /* Load guest registers.  Don't clobber flags. */
1815 #ifdef CONFIG_X86_64
1816                 "mov %c[cr2](%3), %%rax \n\t"
1817                 "mov %%rax, %%cr2 \n\t"
1818                 "mov %c[rax](%3), %%rax \n\t"
1819                 "mov %c[rbx](%3), %%rbx \n\t"
1820                 "mov %c[rdx](%3), %%rdx \n\t"
1821                 "mov %c[rsi](%3), %%rsi \n\t"
1822                 "mov %c[rdi](%3), %%rdi \n\t"
1823                 "mov %c[rbp](%3), %%rbp \n\t"
1824                 "mov %c[r8](%3),  %%r8  \n\t"
1825                 "mov %c[r9](%3),  %%r9  \n\t"
1826                 "mov %c[r10](%3), %%r10 \n\t"
1827                 "mov %c[r11](%3), %%r11 \n\t"
1828                 "mov %c[r12](%3), %%r12 \n\t"
1829                 "mov %c[r13](%3), %%r13 \n\t"
1830                 "mov %c[r14](%3), %%r14 \n\t"
1831                 "mov %c[r15](%3), %%r15 \n\t"
1832                 "mov %c[rcx](%3), %%rcx \n\t" /* kills %3 (rcx) */
1833 #else
1834                 "mov %c[cr2](%3), %%eax \n\t"
1835                 "mov %%eax,   %%cr2 \n\t"
1836                 "mov %c[rax](%3), %%eax \n\t"
1837                 "mov %c[rbx](%3), %%ebx \n\t"
1838                 "mov %c[rdx](%3), %%edx \n\t"
1839                 "mov %c[rsi](%3), %%esi \n\t"
1840                 "mov %c[rdi](%3), %%edi \n\t"
1841                 "mov %c[rbp](%3), %%ebp \n\t"
1842                 "mov %c[rcx](%3), %%ecx \n\t" /* kills %3 (ecx) */
1843 #endif
1844                 /* Enter guest mode */
1845                 "jne launched \n\t"
1846                 ASM_VMX_VMLAUNCH "\n\t"
1847                 "jmp kvm_vmx_return \n\t"
1848                 "launched: " ASM_VMX_VMRESUME "\n\t"
1849                 ".globl kvm_vmx_return \n\t"
1850                 "kvm_vmx_return: "
1851                 /* Save guest registers, load host registers, keep flags */
1852 #ifdef CONFIG_X86_64
1853                 "xchg %3,     (%%rsp) \n\t"
1854                 "mov %%rax, %c[rax](%3) \n\t"
1855                 "mov %%rbx, %c[rbx](%3) \n\t"
1856                 "pushq (%%rsp); popq %c[rcx](%3) \n\t"
1857                 "mov %%rdx, %c[rdx](%3) \n\t"
1858                 "mov %%rsi, %c[rsi](%3) \n\t"
1859                 "mov %%rdi, %c[rdi](%3) \n\t"
1860                 "mov %%rbp, %c[rbp](%3) \n\t"
1861                 "mov %%r8,  %c[r8](%3) \n\t"
1862                 "mov %%r9,  %c[r9](%3) \n\t"
1863                 "mov %%r10, %c[r10](%3) \n\t"
1864                 "mov %%r11, %c[r11](%3) \n\t"
1865                 "mov %%r12, %c[r12](%3) \n\t"
1866                 "mov %%r13, %c[r13](%3) \n\t"
1867                 "mov %%r14, %c[r14](%3) \n\t"
1868                 "mov %%r15, %c[r15](%3) \n\t"
1869                 "mov %%cr2, %%rax   \n\t"
1870                 "mov %%rax, %c[cr2](%3) \n\t"
1871                 "mov (%%rsp), %3 \n\t"
1872
1873                 "pop  %%rcx; pop  %%r15; pop  %%r14; pop  %%r13; pop  %%r12;"
1874                 "pop  %%r11; pop  %%r10; pop  %%r9;  pop  %%r8;"
1875                 "pop  %%rbp; pop  %%rdi; pop  %%rsi;"
1876                 "pop  %%rdx; pop  %%rbx; pop  %%rax \n\t"
1877 #else
1878                 "xchg %3, (%%esp) \n\t"
1879                 "mov %%eax, %c[rax](%3) \n\t"
1880                 "mov %%ebx, %c[rbx](%3) \n\t"
1881                 "pushl (%%esp); popl %c[rcx](%3) \n\t"
1882                 "mov %%edx, %c[rdx](%3) \n\t"
1883                 "mov %%esi, %c[rsi](%3) \n\t"
1884                 "mov %%edi, %c[rdi](%3) \n\t"
1885                 "mov %%ebp, %c[rbp](%3) \n\t"
1886                 "mov %%cr2, %%eax  \n\t"
1887                 "mov %%eax, %c[cr2](%3) \n\t"
1888                 "mov (%%esp), %3 \n\t"
1889
1890                 "pop %%ecx; popa \n\t"
1891 #endif
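                /* CF or ZF set at this point means VMLAUNCH/VMRESUME failed. */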
1892                 "setbe %0 \n\t"
1893                 "popf \n\t"
1894               : "=q" (fail)
1895               : "r"(vcpu->launched), "d"((unsigned long)HOST_RSP),
1896                 "c"(vcpu),
1897                 [rax]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RAX])),
1898                 [rbx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBX])),
1899                 [rcx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RCX])),
1900                 [rdx]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDX])),
1901                 [rsi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RSI])),
1902                 [rdi]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RDI])),
1903                 [rbp]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_RBP])),
1904 #ifdef CONFIG_X86_64
1905                 [r8 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R8 ])),
1906                 [r9 ]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R9 ])),
1907                 [r10]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R10])),
1908                 [r11]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R11])),
1909                 [r12]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R12])),
1910                 [r13]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R13])),
1911                 [r14]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R14])),
1912                 [r15]"i"(offsetof(struct kvm_vcpu, regs[VCPU_REGS_R15])),
1913 #endif
1914                 [cr2]"i"(offsetof(struct kvm_vcpu, cr2))
1915               : "cc", "memory" );
1916
1917         /*
1918          * Reload segment selectors ASAP.  This is needed for a working
1919          * kernel: 32-bit x86 relies on having __KERNEL_PDA in %fs, and
1920          * x86_64 relies on having 0 in %gs, for the CPU PDA to work.
1921          */
1922         if (fs_gs_ldt_reload_needed) {
1923                 load_ldt(ldt_sel);
1924                 load_fs(fs_sel);
1925                 /*
1926                  * If we have to reload gs, we must take care to
1927                  * preserve our gs base: per-cpu data relies on it, so
                      * interrupts stay off until it has been restored.
1928                  */
1929                 local_irq_disable();
1930                 load_gs(gs_sel);
1931 #ifdef CONFIG_X86_64
1932                 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
1933 #endif
1934                 local_irq_enable();
1935
1936                 reload_tss();
1937         }
1938         ++kvm_stat.exits;
1939
1940 #ifdef CONFIG_X86_64
1941         if (is_long_mode(vcpu)) {
1942                 save_msrs(vcpu->guest_msrs, NR_BAD_MSRS);
1943                 load_msrs(vcpu->host_msrs, NR_BAD_MSRS);
1944         }
1945 #endif
1946
1947         fx_save(vcpu->guest_fx_image);
1948         fx_restore(vcpu->host_fx_image);
1949         vcpu->interrupt_window_open =
                 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0;
1950
1951         asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
1952
1953         if (fail) {
1954                 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
1955                 kvm_run->fail_entry.hardware_entry_failure_reason
1956                         = vmcs_read32(VM_INSTRUCTION_ERROR);
1957                 r = 0;
1958         } else {
1959                 /*
1960                  * Profile KVM exit RIPs:
1961                  */
1962                 if (unlikely(prof_on == KVM_PROFILING))
1963                         profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP));
1964
1965                 vcpu->launched = 1;
1966                 r = kvm_handle_exit(kvm_run, vcpu);
1967                 if (r > 0) {
1968                         /* Give the scheduler a chance to reschedule. */
1969                         if (signal_pending(current)) {
1970                                 ++kvm_stat.signal_exits;
1971                                 post_kvm_run_save(vcpu, kvm_run);
1972                                 kvm_run->exit_reason = KVM_EXIT_INTR;
1973                                 return -EINTR;
1974                         }
1975
1976                         if (dm_request_for_irq_injection(vcpu, kvm_run)) {
1977                                 ++kvm_stat.request_irq_exits;
1978                                 post_kvm_run_save(vcpu, kvm_run);
1979                                 kvm_run->exit_reason = KVM_EXIT_INTR;
1980                                 return -EINTR;
1981                         }
1982
1983                         kvm_resched(vcpu);
1984                         goto again;
1985                 }
1986         }
1987
1988         post_kvm_run_save(vcpu, kvm_run);
1989         return r;
1990 }
1991
1992 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1993 {
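        /*
         * No VPID support in this code, so guest translations are not
         * tagged; the CR3 load on the next VM entry is what actually
         * flushes them.
         */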
1994         vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3));
1995 }
1996
1997 static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
1998                                   unsigned long addr,
1999                                   u32 err_code)
2000 {
2001         u32 vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2002
2003         ++kvm_stat.pf_guest;
2004
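        /*
         * If a page fault was already being delivered when this one
         * occurred, the architecture escalates it to a double fault, so
         * inject #DF instead of #PF.
         */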
2005         if (is_page_fault(vect_info)) {
2006                 printk(KERN_DEBUG "inject_page_fault: "
2007                        "double fault 0x%lx @ 0x%lx\n",
2008                        addr, vmcs_readl(GUEST_RIP));
2009                 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 0);
2010                 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2011                              DF_VECTOR |
2012                              INTR_TYPE_EXCEPTION |
2013                              INTR_INFO_DELIEVER_CODE_MASK |
2014                              INTR_INFO_VALID_MASK);
2015                 return;
2016         }
2017         vcpu->cr2 = addr;
2018         vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, err_code);
2019         vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2020                      PF_VECTOR |
2021                      INTR_TYPE_EXCEPTION |
2022                      INTR_INFO_DELIEVER_CODE_MASK |
2023                      INTR_INFO_VALID_MASK);
2024
2025 }
2026
2027 static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2028 {
2029         if (vcpu->vmcs) {
2030                 on_each_cpu(__vcpu_clear, vcpu, 0, 1);
2031                 free_vmcs(vcpu->vmcs);
2032                 vcpu->vmcs = NULL;
2033         }
2034 }
2035
2036 static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
2037 {
2038         vmx_free_vmcs(vcpu);
2039 }
2040
2041 static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
2042 {
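        /*
         * Allocate a page each for the guest and host MSR save areas and
         * a fresh VMCS.  The VMCS is VMCLEARed before first use, and
         * vcpu->launched stays 0 so the first entry uses VMLAUNCH.
         */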
2043         struct vmcs *vmcs;
2044
2045         vcpu->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2046         if (!vcpu->guest_msrs)
2047                 return -ENOMEM;
2048
2049         vcpu->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
2050         if (!vcpu->host_msrs)
2051                 goto out_free_guest_msrs;
2052
2053         vmcs = alloc_vmcs();
2054         if (!vmcs)
2055                 goto out_free_msrs;
2056
2057         vmcs_clear(vmcs);
2058         vcpu->vmcs = vmcs;
2059         vcpu->launched = 0;
2060
2061         return 0;
2062
2063 out_free_msrs:
2064         kfree(vcpu->host_msrs);
2065         vcpu->host_msrs = NULL;
2066
2067 out_free_guest_msrs:
2068         kfree(vcpu->guest_msrs);
2069         vcpu->guest_msrs = NULL;
2070
2071         return -ENOMEM;
2072 }
2073
2074 static struct kvm_arch_ops vmx_arch_ops = {
2075         .cpu_has_kvm_support = cpu_has_kvm_support,
2076         .disabled_by_bios = vmx_disabled_by_bios,
2077         .hardware_setup = hardware_setup,
2078         .hardware_unsetup = hardware_unsetup,
2079         .hardware_enable = hardware_enable,
2080         .hardware_disable = hardware_disable,
2081
2082         .vcpu_create = vmx_create_vcpu,
2083         .vcpu_free = vmx_free_vcpu,
2084
2085         .vcpu_load = vmx_vcpu_load,
2086         .vcpu_put = vmx_vcpu_put,
2087         .vcpu_decache = vmx_vcpu_decache,
2088
2089         .set_guest_debug = set_guest_debug,
2090         .get_msr = vmx_get_msr,
2091         .set_msr = vmx_set_msr,
2092         .get_segment_base = vmx_get_segment_base,
2093         .get_segment = vmx_get_segment,
2094         .set_segment = vmx_set_segment,
2095         .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
2096         .decache_cr0_cr4_guest_bits = vmx_decache_cr0_cr4_guest_bits,
2097         .set_cr0 = vmx_set_cr0,
2098         .set_cr3 = vmx_set_cr3,
2099         .set_cr4 = vmx_set_cr4,
2100 #ifdef CONFIG_X86_64
2101         .set_efer = vmx_set_efer,
2102 #endif
2103         .get_idt = vmx_get_idt,
2104         .set_idt = vmx_set_idt,
2105         .get_gdt = vmx_get_gdt,
2106         .set_gdt = vmx_set_gdt,
2107         .cache_regs = vcpu_load_rsp_rip,
2108         .decache_regs = vcpu_put_rsp_rip,
2109         .get_rflags = vmx_get_rflags,
2110         .set_rflags = vmx_set_rflags,
2111
2112         .tlb_flush = vmx_flush_tlb,
2113         .inject_page_fault = vmx_inject_page_fault,
2114
2115         .inject_gp = vmx_inject_gp,
2116
2117         .run = vmx_vcpu_run,
2118         .skip_emulated_instruction = skip_emulated_instruction,
2119         .vcpu_setup = vmx_vcpu_setup,
2120         .patch_hypercall = vmx_patch_hypercall,
2121 };
2122
2123 static int __init vmx_init(void)
2124 {
2125         return kvm_init_arch(&vmx_arch_ops, THIS_MODULE);
2126 }
2127
2128 static void __exit vmx_exit(void)
2129 {
2130         kvm_exit_arch();
2131 }
2132
2133 module_init(vmx_init)
2134 module_exit(vmx_exit)