2 * Kernel-based Virtual Machine driver for Linux
4 * This module enables machines with Intel VT-x extensions to run virtual
5 * machines without emulation or binary translation.
9 * Copyright (C) 2006 Qumranet, Inc.
12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Avi Kivity <avi@qumranet.com>
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
19 #include <linux/types.h>
20 #include <linux/string.h>
23 #include <linux/highmem.h>
24 #include <linux/module.h>
34 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
36 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
41 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
42 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
46 #define pgprintk(x...) do { } while (0)
47 #define rmap_printk(x...) do { } while (0)
51 #if defined(MMU_DEBUG) || defined(AUDIT)
57 printk(KERN_WARNING "assertion failed %s:%d: %s\n", \
58 __FILE__, __LINE__, #x); \
61 #define PT64_PT_BITS 9
62 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
63 #define PT32_PT_BITS 10
64 #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
66 #define PT_WRITABLE_SHIFT 1
68 #define PT_PRESENT_MASK (1ULL << 0)
69 #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
70 #define PT_USER_MASK (1ULL << 2)
71 #define PT_PWT_MASK (1ULL << 3)
72 #define PT_PCD_MASK (1ULL << 4)
73 #define PT_ACCESSED_MASK (1ULL << 5)
74 #define PT_DIRTY_MASK (1ULL << 6)
75 #define PT_PAGE_SIZE_MASK (1ULL << 7)
76 #define PT_PAT_MASK (1ULL << 7)
77 #define PT_GLOBAL_MASK (1ULL << 8)
78 #define PT64_NX_MASK (1ULL << 63)
80 #define PT_PAT_SHIFT 7
81 #define PT_DIR_PAT_SHIFT 12
82 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
84 #define PT32_DIR_PSE36_SIZE 4
85 #define PT32_DIR_PSE36_SHIFT 13
86 #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
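/*
 * Illustration: a PSE36 4MB guest pde keeps physical address bits
 * 32-35 in pde bits 13-16, so the high part of the frame address is
 * roughly (pde & PT32_DIR_PSE36_MASK) << (32 - PT32_DIR_PSE36_SHIFT).
 */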
89 #define PT32_PTE_COPY_MASK \
90 (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)
92 #define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)
94 #define PT_FIRST_AVAIL_BITS_SHIFT 9
95 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
97 #define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
98 #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
100 #define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
101 #define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
103 #define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
104 #define PT_SHADOW_USER_MASK (1ULL << PT_SHADOW_USER_SHIFT)
106 #define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
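/*
 * The guest's permission bits are mirrored into the shadow pte's
 * software-available bits, shifted up by PT_SHADOW_BITS_OFFSET: e.g.
 * the guest writable bit (bit 1) is kept at PT_SHADOW_WRITABLE_SHIFT
 * and the guest user bit (bit 2) at PT_SHADOW_USER_SHIFT.
 */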
108 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
110 #define PT64_LEVEL_BITS 9
112 #define PT64_LEVEL_SHIFT(level) \
113 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
115 #define PT64_LEVEL_MASK(level) \
116 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
118 #define PT64_INDEX(address, level)\
119 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
122 #define PT32_LEVEL_BITS 10
124 #define PT32_LEVEL_SHIFT(level) \
125 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
127 #define PT32_LEVEL_MASK(level) \
128 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
130 #define PT32_INDEX(address, level)\
131 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
134 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
135 #define PT64_DIR_BASE_ADDR_MASK \
136 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
138 #define PT32_BASE_ADDR_MASK PAGE_MASK
139 #define PT32_DIR_BASE_ADDR_MASK \
140 (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
143 #define PFERR_PRESENT_MASK (1U << 0)
144 #define PFERR_WRITE_MASK (1U << 1)
145 #define PFERR_USER_MASK (1U << 2)
146 #define PFERR_FETCH_MASK (1U << 4)
148 #define PT64_ROOT_LEVEL 4
149 #define PT32_ROOT_LEVEL 2
150 #define PT32E_ROOT_LEVEL 3
152 #define PT_DIRECTORY_LEVEL 2
153 #define PT_PAGE_TABLE_LEVEL 1
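/*
 * Root levels: 4 for long mode, 3 for PAE, 2 for legacy 32-bit paging;
 * within a hierarchy, level 1 is a page table and level 2 a directory.
 */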
157 struct kvm_rmap_desc {
158 u64 *shadow_ptes[RMAP_EXT];
159 struct kvm_rmap_desc *more;
162 static struct kmem_cache *pte_chain_cache;
163 static struct kmem_cache *rmap_desc_cache;
165 static int is_write_protection(struct kvm_vcpu *vcpu)
167 return vcpu->cr0 & CR0_WP_MASK;
170 static int is_cpuid_PSE36(void)
175 static int is_nx(struct kvm_vcpu *vcpu)
177 return vcpu->shadow_efer & EFER_NX;
180 static int is_present_pte(unsigned long pte)
182 return pte & PT_PRESENT_MASK;
185 static int is_writeble_pte(unsigned long pte)
187 return pte & PT_WRITABLE_MASK;
190 static int is_io_pte(unsigned long pte)
192 return pte & PT_SHADOW_IO_MARK;
195 static int is_rmap_pte(u64 pte)
197 return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
198 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
201 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
202 struct kmem_cache *base_cache, int min,
207 if (cache->nobjs >= min)
209 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
210 obj = kmem_cache_zalloc(base_cache, gfp_flags);
213 cache->objects[cache->nobjs++] = obj;
218 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
221 kfree(mc->objects[--mc->nobjs]);
224 static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
228 r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
229 pte_chain_cache, 4, gfp_flags);
232 r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
233 rmap_desc_cache, 1, gfp_flags);
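/*
 * Sketch of the refill policy below: try an atomic (GFP_NOWAIT) top-up
 * first while kvm->lock is held; if that fails, drop the lock and the
 * vcpu so a sleeping GFP_KERNEL allocation is safe, then retake both.
 */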
238 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
242 r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
244 spin_unlock(&vcpu->kvm->lock);
245 kvm_arch_ops->vcpu_put(vcpu);
246 r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
247 kvm_arch_ops->vcpu_load(vcpu);
248 spin_lock(&vcpu->kvm->lock);
253 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
255 mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
256 mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
259 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
265 p = mc->objects[--mc->nobjs];
270 static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
272 if (mc->nobjs < KVM_NR_MEM_OBJS)
273 mc->objects[mc->nobjs++] = obj;
278 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
280 return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
281 sizeof(struct kvm_pte_chain));
284 static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
285 struct kvm_pte_chain *pc)
287 mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
290 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
292 return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
293 sizeof(struct kvm_rmap_desc));
296 static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
297 struct kvm_rmap_desc *rd)
299 mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
303 * Reverse mapping data structures:
305 * If page->private bit zero is zero, then page->private points to the
306 * shadow page table entry that points to page_address(page).
308 * If page->private bit zero is one, (then page->private & ~1) points
309 * to a struct kvm_rmap_desc containing more mappings.
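/*
 * Illustrative encoding (values, not code):
 *   no mappings:   page_private(page) == 0
 *   one mapping:   page_private(page) == (unsigned long)spte       (bit 0 clear)
 *   many mappings: page_private(page) == (unsigned long)desc | 1   (bit 0 set)
 */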
311 static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
314 struct kvm_rmap_desc *desc;
317 if (!is_rmap_pte(*spte))
319 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
320 if (!page_private(page)) {
321 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
322 set_page_private(page, (unsigned long)spte);
323 } else if (!(page_private(page) & 1)) {
324 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
325 desc = mmu_alloc_rmap_desc(vcpu);
326 desc->shadow_ptes[0] = (u64 *)page_private(page);
327 desc->shadow_ptes[1] = spte;
328 set_page_private(page, (unsigned long)desc | 1);
330 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
331 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
332 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
334 if (desc->shadow_ptes[RMAP_EXT-1]) {
335 desc->more = mmu_alloc_rmap_desc(vcpu);
338 for (i = 0; desc->shadow_ptes[i]; ++i)
340 desc->shadow_ptes[i] = spte;
344 static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
346 struct kvm_rmap_desc *desc,
348 struct kvm_rmap_desc *prev_desc)
352 for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
354 desc->shadow_ptes[i] = desc->shadow_ptes[j];
355 desc->shadow_ptes[j] = NULL;
358 if (!prev_desc && !desc->more)
359 set_page_private(page, (unsigned long)desc->shadow_ptes[0]);
362 prev_desc->more = desc->more;
364 set_page_private(page, (unsigned long)desc->more | 1);
365 mmu_free_rmap_desc(vcpu, desc);
368 static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
371 struct kvm_rmap_desc *desc;
372 struct kvm_rmap_desc *prev_desc;
375 if (!is_rmap_pte(*spte))
377 page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
378 if (!page_private(page)) {
379 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
381 } else if (!(page_private(page) & 1)) {
382 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
383 if ((u64 *)page_private(page) != spte) {
384 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
388 set_page_private(page, 0);
390 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
391 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
394 for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
395 if (desc->shadow_ptes[i] == spte) {
396 rmap_desc_remove_entry(vcpu, page,
408 static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
410 struct kvm *kvm = vcpu->kvm;
412 struct kvm_rmap_desc *desc;
415 page = gfn_to_page(kvm, gfn);
418 while (page_private(page)) {
419 if (!(page_private(page) & 1))
420 spte = (u64 *)page_private(page);
422 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
423 spte = desc->shadow_ptes[0];
426 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
427 != page_to_pfn(page));
428 BUG_ON(!(*spte & PT_PRESENT_MASK));
429 BUG_ON(!(*spte & PT_WRITABLE_MASK));
430 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
431 rmap_remove(vcpu, spte);
432 kvm_arch_ops->tlb_flush(vcpu);
433 *spte &= ~(u64)PT_WRITABLE_MASK;
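/*
 * Note that the spte left the rmap before its write bit was cleared:
 * is_rmap_pte() only admits present+writable sptes, so a read-only
 * spte must not remain on the reverse-map chain.
 */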
437 static int is_empty_shadow_page(hpa_t page_hpa)
442 for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
445 printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
452 static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
454 struct kvm_mmu_page *page_head = page_header(page_hpa);
456 ASSERT(is_empty_shadow_page(page_hpa));
457 page_head->page_hpa = page_hpa;
458 list_move(&page_head->link, &vcpu->free_pages);
459 ++vcpu->kvm->n_free_mmu_pages;
462 static unsigned kvm_page_table_hashfn(gfn_t gfn)
467 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
470 struct kvm_mmu_page *page;
472 if (list_empty(&vcpu->free_pages))
475 page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
476 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
477 ASSERT(is_empty_shadow_page(page->page_hpa));
478 page->slot_bitmap = 0;
479 page->multimapped = 0;
480 page->parent_pte = parent_pte;
481 --vcpu->kvm->n_free_mmu_pages;
485 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
486 struct kvm_mmu_page *page, u64 *parent_pte)
488 struct kvm_pte_chain *pte_chain;
489 struct hlist_node *node;
494 if (!page->multimapped) {
495 u64 *old = page->parent_pte;
498 page->parent_pte = parent_pte;
501 page->multimapped = 1;
502 pte_chain = mmu_alloc_pte_chain(vcpu);
503 INIT_HLIST_HEAD(&page->parent_ptes);
504 hlist_add_head(&pte_chain->link, &page->parent_ptes);
505 pte_chain->parent_ptes[0] = old;
507 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
508 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
510 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
511 if (!pte_chain->parent_ptes[i]) {
512 pte_chain->parent_ptes[i] = parent_pte;
516 pte_chain = mmu_alloc_pte_chain(vcpu);
518 hlist_add_head(&pte_chain->link, &page->parent_ptes);
519 pte_chain->parent_ptes[0] = parent_pte;
522 static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
523 struct kvm_mmu_page *page,
526 struct kvm_pte_chain *pte_chain;
527 struct hlist_node *node;
530 if (!page->multimapped) {
531 BUG_ON(page->parent_pte != parent_pte);
532 page->parent_pte = NULL;
535 hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
536 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
537 if (!pte_chain->parent_ptes[i])
539 if (pte_chain->parent_ptes[i] != parent_pte)
541 while (i + 1 < NR_PTE_CHAIN_ENTRIES
542 && pte_chain->parent_ptes[i + 1]) {
543 pte_chain->parent_ptes[i]
544 = pte_chain->parent_ptes[i + 1];
547 pte_chain->parent_ptes[i] = NULL;
549 hlist_del(&pte_chain->link);
550 mmu_free_pte_chain(vcpu, pte_chain);
551 if (hlist_empty(&page->parent_ptes)) {
552 page->multimapped = 0;
553 page->parent_pte = NULL;
561 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
565 struct hlist_head *bucket;
566 struct kvm_mmu_page *page;
567 struct hlist_node *node;
569 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
570 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
571 bucket = &vcpu->kvm->mmu_page_hash[index];
572 hlist_for_each_entry(page, node, bucket, hash_link)
573 if (page->gfn == gfn && !page->role.metaphysical) {
574 pgprintk("%s: found role %x\n",
575 __FUNCTION__, page->role.word);
581 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
586 unsigned hugepage_access,
589 union kvm_mmu_page_role role;
592 struct hlist_head *bucket;
593 struct kvm_mmu_page *page;
594 struct hlist_node *node;
597 role.glevels = vcpu->mmu.root_level;
599 role.metaphysical = metaphysical;
600 role.hugepage_access = hugepage_access;
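/*
 * With a 32-bit non-PAE guest, one 1024-entry guest table is shadowed
 * by several 512-entry shadow pages; role.quadrant records which slice
 * this shadow page covers (two halves at the pte level, four quarters
 * at the pde level).
 */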
601 if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
602 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
603 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
604 role.quadrant = quadrant;
606 pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
608 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
609 bucket = &vcpu->kvm->mmu_page_hash[index];
610 hlist_for_each_entry(page, node, bucket, hash_link)
611 if (page->gfn == gfn && page->role.word == role.word) {
612 mmu_page_add_parent_pte(vcpu, page, parent_pte);
613 pgprintk("%s: found\n", __FUNCTION__);
616 page = kvm_mmu_alloc_page(vcpu, parent_pte);
619 pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
622 hlist_add_head(&page->hash_link, bucket);
624 rmap_write_protect(vcpu, gfn);
628 static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
629 struct kvm_mmu_page *page)
635 pt = __va(page->page_hpa);
637 if (page->role.level == PT_PAGE_TABLE_LEVEL) {
638 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
639 if (pt[i] & PT_PRESENT_MASK)
640 rmap_remove(vcpu, &pt[i]);
643 kvm_arch_ops->tlb_flush(vcpu);
647 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
651 if (!(ent & PT_PRESENT_MASK))
653 ent &= PT64_BASE_ADDR_MASK;
654 mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
658 static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
659 struct kvm_mmu_page *page,
662 mmu_page_remove_parent_pte(vcpu, page, parent_pte);
665 static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
666 struct kvm_mmu_page *page)
670 while (page->multimapped || page->parent_pte) {
671 if (!page->multimapped)
672 parent_pte = page->parent_pte;
674 struct kvm_pte_chain *chain;
676 chain = container_of(page->parent_ptes.first,
677 struct kvm_pte_chain, link);
678 parent_pte = chain->parent_ptes[0];
681 kvm_mmu_put_page(vcpu, page, parent_pte);
684 kvm_mmu_page_unlink_children(vcpu, page);
685 if (!page->root_count) {
686 hlist_del(&page->hash_link);
687 kvm_mmu_free_page(vcpu, page->page_hpa);
689 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
692 static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
695 struct hlist_head *bucket;
696 struct kvm_mmu_page *page;
697 struct hlist_node *node, *n;
700 pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
702 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
703 bucket = &vcpu->kvm->mmu_page_hash[index];
704 hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
705 if (page->gfn == gfn && !page->role.metaphysical) {
706 pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
708 kvm_mmu_zap_page(vcpu, page);
714 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
716 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
717 struct kvm_mmu_page *page_head = page_header(__pa(pte));
719 __set_bit(slot, &page_head->slot_bitmap);
722 hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
724 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
726 return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
729 hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
733 ASSERT((gpa & HPA_ERR_MASK) == 0);
734 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
736 return gpa | HPA_ERR_MASK;
737 return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
738 | (gpa & (PAGE_SIZE-1));
741 hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
743 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
745 if (gpa == UNMAPPED_GVA)
747 return gpa_to_hpa(vcpu, gpa);
750 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
752 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
754 if (gpa == UNMAPPED_GVA)
756 return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
759 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
763 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
765 int level = PT32E_ROOT_LEVEL;
766 hpa_t table_addr = vcpu->mmu.root_hpa;
769 u32 index = PT64_INDEX(v, level);
773 ASSERT(VALID_PAGE(table_addr));
774 table = __va(table_addr);
778 if (is_present_pte(pte) && is_writeble_pte(pte))
780 mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
781 page_header_update_slot(vcpu->kvm, table, v);
782 table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
784 rmap_add(vcpu, &table[index]);
788 if (table[index] == 0) {
789 struct kvm_mmu_page *new_table;
792 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
794 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
796 1, 0, &table[index]);
798 pgprintk("nonpaging_map: ENOMEM\n");
802 table[index] = new_table->page_hpa | PT_PRESENT_MASK
803 | PT_WRITABLE_MASK | PT_USER_MASK;
805 table_addr = table[index] & PT64_BASE_ADDR_MASK;
809 static void mmu_free_roots(struct kvm_vcpu *vcpu)
812 struct kvm_mmu_page *page;
815 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
816 hpa_t root = vcpu->mmu.root_hpa;
818 ASSERT(VALID_PAGE(root));
819 page = page_header(root);
821 vcpu->mmu.root_hpa = INVALID_PAGE;
825 for (i = 0; i < 4; ++i) {
826 hpa_t root = vcpu->mmu.pae_root[i];
829 ASSERT(VALID_PAGE(root));
830 root &= PT64_BASE_ADDR_MASK;
831 page = page_header(root);
834 vcpu->mmu.pae_root[i] = INVALID_PAGE;
836 vcpu->mmu.root_hpa = INVALID_PAGE;
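/*
 * Sketch of the root layout built below: with a 4-level shadow a single
 * root page is used; otherwise the four vcpu->mmu.pae_root entries act
 * as a PDPT-style root, each pointing at a PT32E-level shadow page
 * (derived from the guest pdptrs when the guest itself runs in PAE
 * mode), and root_hpa points at the pae_root page.
 */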
839 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
843 struct kvm_mmu_page *page;
845 root_gfn = vcpu->cr3 >> PAGE_SHIFT;
848 if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
849 hpa_t root = vcpu->mmu.root_hpa;
851 ASSERT(!VALID_PAGE(root));
852 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
853 PT64_ROOT_LEVEL, 0, 0, NULL);
854 root = page->page_hpa;
856 vcpu->mmu.root_hpa = root;
860 for (i = 0; i < 4; ++i) {
861 hpa_t root = vcpu->mmu.pae_root[i];
863 ASSERT(!VALID_PAGE(root));
864 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
865 if (!is_present_pte(vcpu->pdptrs[i])) {
866 vcpu->mmu.pae_root[i] = 0;
869 root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
870 } else if (vcpu->mmu.root_level == 0)
872 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
873 PT32_ROOT_LEVEL, !is_paging(vcpu),
875 root = page->page_hpa;
877 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
879 vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
882 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
887 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
894 r = mmu_topup_memory_caches(vcpu);
899 ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
902 paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);
904 if (is_error_hpa(paddr))
907 return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
910 static void nonpaging_free(struct kvm_vcpu *vcpu)
912 mmu_free_roots(vcpu);
915 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
917 struct kvm_mmu *context = &vcpu->mmu;
919 context->new_cr3 = nonpaging_new_cr3;
920 context->page_fault = nonpaging_page_fault;
921 context->gva_to_gpa = nonpaging_gva_to_gpa;
922 context->free = nonpaging_free;
923 context->root_level = 0;
924 context->shadow_root_level = PT32E_ROOT_LEVEL;
925 mmu_alloc_roots(vcpu);
926 ASSERT(VALID_PAGE(context->root_hpa));
927 kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
931 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
933 ++kvm_stat.tlb_flush;
934 kvm_arch_ops->tlb_flush(vcpu);
937 static void paging_new_cr3(struct kvm_vcpu *vcpu)
939 pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
940 mmu_free_roots(vcpu);
941 if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
942 kvm_mmu_free_some_pages(vcpu);
943 mmu_alloc_roots(vcpu);
944 kvm_mmu_flush_tlb(vcpu);
945 kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
948 static inline void set_pte_common(struct kvm_vcpu *vcpu,
957 *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
959 access_bits &= ~PT_WRITABLE_MASK;
961 paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
963 *shadow_pte |= access_bits;
965 if (is_error_hpa(paddr)) {
966 *shadow_pte |= gaddr;
967 *shadow_pte |= PT_SHADOW_IO_MARK;
968 *shadow_pte &= ~PT_PRESENT_MASK;
972 *shadow_pte |= paddr;
974 if (access_bits & PT_WRITABLE_MASK) {
975 struct kvm_mmu_page *shadow;
977 shadow = kvm_mmu_lookup_page(vcpu, gfn);
979 pgprintk("%s: found shadow page for %lx, marking ro\n",
981 access_bits &= ~PT_WRITABLE_MASK;
982 if (is_writeble_pte(*shadow_pte)) {
983 *shadow_pte &= ~PT_WRITABLE_MASK;
984 kvm_arch_ops->tlb_flush(vcpu);
989 if (access_bits & PT_WRITABLE_MASK)
990 mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
992 page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
993 rmap_add(vcpu, shadow_pte);
996 static void inject_page_fault(struct kvm_vcpu *vcpu,
1000 kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
1003 static inline int fix_read_pf(u64 *shadow_ent)
1005 if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
1006 !(*shadow_ent & PT_USER_MASK)) {
1008 * If supervisor write protect is disabled, we shadow kernel
1009 * pages as user pages so we can trap the write access.
1011 *shadow_ent |= PT_USER_MASK;
1012 *shadow_ent &= ~PT_WRITABLE_MASK;
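/*
 * Restoring PT_USER_MASK makes the page reachable from user mode again,
 * so the write permission granted for the cr0.wp=0 supervisor case is
 * dropped; a later write faults again and gets fixed up separately.
 */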
1020 static void paging_free(struct kvm_vcpu *vcpu)
1022 nonpaging_free(vcpu);
1026 #include "paging_tmpl.h"
1030 #include "paging_tmpl.h"
1033 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1035 struct kvm_mmu *context = &vcpu->mmu;
1037 ASSERT(is_pae(vcpu));
1038 context->new_cr3 = paging_new_cr3;
1039 context->page_fault = paging64_page_fault;
1040 context->gva_to_gpa = paging64_gva_to_gpa;
1041 context->free = paging_free;
1042 context->root_level = level;
1043 context->shadow_root_level = level;
1044 mmu_alloc_roots(vcpu);
1045 ASSERT(VALID_PAGE(context->root_hpa));
1046 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1047 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1051 static int paging64_init_context(struct kvm_vcpu *vcpu)
1053 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1056 static int paging32_init_context(struct kvm_vcpu *vcpu)
1058 struct kvm_mmu *context = &vcpu->mmu;
1060 context->new_cr3 = paging_new_cr3;
1061 context->page_fault = paging32_page_fault;
1062 context->gva_to_gpa = paging32_gva_to_gpa;
1063 context->free = paging_free;
1064 context->root_level = PT32_ROOT_LEVEL;
1065 context->shadow_root_level = PT32E_ROOT_LEVEL;
1066 mmu_alloc_roots(vcpu);
1067 ASSERT(VALID_PAGE(context->root_hpa));
1068 kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1069 (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1073 static int paging32E_init_context(struct kvm_vcpu *vcpu)
1075 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1078 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1081 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1083 if (!is_paging(vcpu))
1084 return nonpaging_init_context(vcpu);
1085 else if (is_long_mode(vcpu))
1086 return paging64_init_context(vcpu);
1087 else if (is_pae(vcpu))
1088 return paging32E_init_context(vcpu);
1090 return paging32_init_context(vcpu);
1093 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1096 if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1097 vcpu->mmu.free(vcpu);
1098 vcpu->mmu.root_hpa = INVALID_PAGE;
1102 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1106 destroy_kvm_mmu(vcpu);
1107 r = init_kvm_mmu(vcpu);
1110 r = mmu_topup_memory_caches(vcpu);
1115 static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu,
1116 struct kvm_mmu_page *page,
1120 struct kvm_mmu_page *child;
1123 if (is_present_pte(pte)) {
1124 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1125 rmap_remove(vcpu, spte);
1127 child = page_header(pte & PT64_BASE_ADDR_MASK);
1128 mmu_page_remove_parent_pte(vcpu, child, spte);
1134 void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1136 gfn_t gfn = gpa >> PAGE_SHIFT;
1137 struct kvm_mmu_page *page;
1138 struct hlist_node *node, *n;
1139 struct hlist_head *bucket;
1142 unsigned offset = offset_in_page(gpa);
1144 unsigned page_offset;
1145 unsigned misaligned;
1150 pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1151 if (gfn == vcpu->last_pt_write_gfn) {
1152 ++vcpu->last_pt_write_count;
1153 if (vcpu->last_pt_write_count >= 3)
1156 vcpu->last_pt_write_gfn = gfn;
1157 vcpu->last_pt_write_count = 1;
1159 index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1160 bucket = &vcpu->kvm->mmu_page_hash[index];
1161 hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1162 if (page->gfn != gfn || page->role.metaphysical)
1164 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1165 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1166 if (misaligned || flooded) {
1168 * Misaligned accesses are too much trouble to fix
1169 * up; also, they usually indicate a page is not used as a page table.
1172 * If we're seeing too many writes to a page,
1173 * it may no longer be a page table, or we may be
1174 * forking, in which case it is better to unmap the page.
1177 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1178 gpa, bytes, page->role.word);
1179 kvm_mmu_zap_page(vcpu, page);
1182 page_offset = offset;
1183 level = page->role.level;
1185 if (page->role.glevels == PT32_ROOT_LEVEL) {
1186 page_offset <<= 1; /* 32->64 */
1188 * A 32-bit pde maps 4MB while the shadow pdes map
1189 * only 2MB. So we need to double the offset again
1190 * and zap two pdes instead of one.
1192 if (level == PT32_ROOT_LEVEL) {
1193 page_offset &= ~7; /* kill rounding error */
1197 page_offset &= ~PAGE_MASK;
1199 spte = __va(page->page_hpa);
1200 spte += page_offset / sizeof(*spte);
1202 mmu_pre_write_zap_pte(vcpu, page, spte);
1208 void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
1212 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1214 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1216 return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
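/*
 * Reclaim shadow pages once the free pool falls below KVM_REFILL_PAGES
 * by zapping from the tail of kvm->active_mmu_pages, i.e. roughly the
 * oldest shadow pages, since newly allocated ones are added at the head.
 */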
1219 void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1221 while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1222 struct kvm_mmu_page *page;
1224 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1225 struct kvm_mmu_page, link);
1226 kvm_mmu_zap_page(vcpu, page);
1229 EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
1231 static void free_mmu_pages(struct kvm_vcpu *vcpu)
1233 struct kvm_mmu_page *page;
1235 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1236 page = container_of(vcpu->kvm->active_mmu_pages.next,
1237 struct kvm_mmu_page, link);
1238 kvm_mmu_zap_page(vcpu, page);
1240 while (!list_empty(&vcpu->free_pages)) {
1241 page = list_entry(vcpu->free_pages.next,
1242 struct kvm_mmu_page, link);
1243 list_del(&page->link);
1244 __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
1245 page->page_hpa = INVALID_PAGE;
1247 free_page((unsigned long)vcpu->mmu.pae_root);
1250 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1257 for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
1258 struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
1260 INIT_LIST_HEAD(&page_header->link);
1261 if ((page = alloc_page(GFP_KERNEL)) == NULL)
1263 set_page_private(page, (unsigned long)page_header);
1264 page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
1265 memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
1266 list_add(&page_header->link, &vcpu->free_pages);
1267 ++vcpu->kvm->n_free_mmu_pages;
1271 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1272 * Therefore we need to allocate shadow page tables in the first
1273 * 4GB of memory, which happens to fit the DMA32 zone.
1275 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1278 vcpu->mmu.pae_root = page_address(page);
1279 for (i = 0; i < 4; ++i)
1280 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1285 free_mmu_pages(vcpu);
1289 int kvm_mmu_create(struct kvm_vcpu *vcpu)
1292 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1293 ASSERT(list_empty(&vcpu->free_pages));
1295 return alloc_mmu_pages(vcpu);
1298 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1301 ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1302 ASSERT(!list_empty(&vcpu->free_pages));
1304 return init_kvm_mmu(vcpu);
1307 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1311 destroy_kvm_mmu(vcpu);
1312 free_mmu_pages(vcpu);
1313 mmu_free_memory_caches(vcpu);
1316 void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
1318 struct kvm *kvm = vcpu->kvm;
1319 struct kvm_mmu_page *page;
1321 list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1325 if (!test_bit(slot, &page->slot_bitmap))
1328 pt = __va(page->page_hpa);
1329 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1331 if (pt[i] & PT_WRITABLE_MASK) {
1332 rmap_remove(vcpu, &pt[i]);
1333 pt[i] &= ~PT_WRITABLE_MASK;
1338 void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
1340 destroy_kvm_mmu(vcpu);
1342 while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1343 struct kvm_mmu_page *page;
1345 page = container_of(vcpu->kvm->active_mmu_pages.next,
1346 struct kvm_mmu_page, link);
1347 kvm_mmu_zap_page(vcpu, page);
1350 mmu_free_memory_caches(vcpu);
1351 kvm_arch_ops->tlb_flush(vcpu);
1355 void kvm_mmu_module_exit(void)
1357 if (pte_chain_cache)
1358 kmem_cache_destroy(pte_chain_cache);
1359 if (rmap_desc_cache)
1360 kmem_cache_destroy(rmap_desc_cache);
1363 int kvm_mmu_module_init(void)
1365 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1366 sizeof(struct kvm_pte_chain),
1368 if (!pte_chain_cache)
1370 rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1371 sizeof(struct kvm_rmap_desc),
1373 if (!rmap_desc_cache)
1379 kvm_mmu_module_exit();
1385 static const char *audit_msg;
1387 static gva_t canonicalize(gva_t gva)
1389 #ifdef CONFIG_X86_64
1390 gva = (long long)(gva << 16) >> 16;
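/* Sign-extend bit 47 so the audit compares canonical 48-bit addresses. */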
1395 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1396 gva_t va, int level)
1398 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1400 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1402 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1405 if (!(ent & PT_PRESENT_MASK))
1408 va = canonicalize(va);
1410 audit_mappings_page(vcpu, ent, va, level - 1);
1412 gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1413 hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1415 if ((ent & PT_PRESENT_MASK)
1416 && (ent & PT64_BASE_ADDR_MASK) != hpa)
1417 printk(KERN_ERR "audit error: (%s) levels %d"
1418 " gva %lx gpa %llx hpa %llx ent %llx\n",
1419 audit_msg, vcpu->mmu.root_level,
1425 static void audit_mappings(struct kvm_vcpu *vcpu)
1429 if (vcpu->mmu.root_level == 4)
1430 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1432 for (i = 0; i < 4; ++i)
1433 if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1434 audit_mappings_page(vcpu,
1435 vcpu->mmu.pae_root[i],
1440 static int count_rmaps(struct kvm_vcpu *vcpu)
1445 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1446 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1447 struct kvm_rmap_desc *d;
1449 for (j = 0; j < m->npages; ++j) {
1450 struct page *page = m->phys_mem[j];
1454 if (!(page_private(page) & 1)) {
1458 d = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
1460 for (k = 0; k < RMAP_EXT; ++k)
1461 if (d->shadow_ptes[k])
1472 static int count_writable_mappings(struct kvm_vcpu *vcpu)
1475 struct kvm_mmu_page *page;
1478 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1479 u64 *pt = __va(page->page_hpa);
1481 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1484 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1487 if (!(ent & PT_PRESENT_MASK))
1489 if (!(ent & PT_WRITABLE_MASK))
1497 static void audit_rmap(struct kvm_vcpu *vcpu)
1499 int n_rmap = count_rmaps(vcpu);
1500 int n_actual = count_writable_mappings(vcpu);
1502 if (n_rmap != n_actual)
1503 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1504 __FUNCTION__, audit_msg, n_rmap, n_actual);
1507 static void audit_write_protection(struct kvm_vcpu *vcpu)
1509 struct kvm_mmu_page *page;
1511 list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1515 if (page->role.metaphysical)
1518 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1520 pg = pfn_to_page(hfn);
1522 printk(KERN_ERR "%s: (%s) shadow page has writable"
1523 " mappings: gfn %lx role %x\n",
1524 __FUNCTION__, audit_msg, page->gfn,
1529 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1536 audit_write_protection(vcpu);
1537 audit_mappings(vcpu);