KVM: MMU: Simplify accessed/dirty/present/nx bit handling
1 /*
2  * Kernel-based Virtual Machine driver for Linux
3  *
4  * This module enables machines with Intel VT-x extensions to run virtual
5  * machines without emulation or binary translation.
6  *
7  * MMU support
8  *
9  * Copyright (C) 2006 Qumranet, Inc.
10  *
11  * Authors:
12  *   Yaniv Kamay  <yaniv@qumranet.com>
13  *   Avi Kivity   <avi@qumranet.com>
14  *
15  * This work is licensed under the terms of the GNU GPL, version 2.  See
16  * the COPYING file in the top-level directory.
17  *
18  */
19 #include <linux/types.h>
20 #include <linux/string.h>
21 #include <asm/page.h>
22 #include <linux/mm.h>
23 #include <linux/highmem.h>
24 #include <linux/module.h>
25 #include <asm/cmpxchg.h>
26
27 #include "vmx.h"
28 #include "kvm.h"
29
30 #undef MMU_DEBUG
31
32 #undef AUDIT
33
34 #ifdef AUDIT
35 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
36 #else
37 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
38 #endif
39
40 #ifdef MMU_DEBUG
41
42 #define pgprintk(x...) do { if (dbg) printk(x); } while (0)
43 #define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
44
45 #else
46
47 #define pgprintk(x...) do { } while (0)
48 #define rmap_printk(x...) do { } while (0)
49
50 #endif
51
52 #if defined(MMU_DEBUG) || defined(AUDIT)
53 static int dbg = 1;
54 #endif
55
56 #ifndef MMU_DEBUG
57 #define ASSERT(x) do { } while (0)
58 #else
59 #define ASSERT(x)                                                       \
60         if (!(x)) {                                                     \
61                 printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
62                        __FILE__, __LINE__, #x);                         \
63         }
64 #endif
65
66 #define PT64_PT_BITS 9
67 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
68 #define PT32_PT_BITS 10
69 #define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
70
71 #define PT_WRITABLE_SHIFT 1
72
73 #define PT_PRESENT_MASK (1ULL << 0)
74 #define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
75 #define PT_USER_MASK (1ULL << 2)
76 #define PT_PWT_MASK (1ULL << 3)
77 #define PT_PCD_MASK (1ULL << 4)
78 #define PT_ACCESSED_MASK (1ULL << 5)
79 #define PT_DIRTY_MASK (1ULL << 6)
80 #define PT_PAGE_SIZE_MASK (1ULL << 7)
81 #define PT_PAT_MASK (1ULL << 7)
82 #define PT_GLOBAL_MASK (1ULL << 8)
83 #define PT64_NX_MASK (1ULL << 63)
84
85 #define PT_PAT_SHIFT 7
86 #define PT_DIR_PAT_SHIFT 12
87 #define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)
88
89 #define PT32_DIR_PSE36_SIZE 4
90 #define PT32_DIR_PSE36_SHIFT 13
91 #define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)
92
93
94 #define PT_FIRST_AVAIL_BITS_SHIFT 9
95 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
96
97 #define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
98 #define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
99
100 #define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
101 #define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)
102
103 #define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
104 #define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))
105
106 #define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)
107
108 #define VALID_PAGE(x) ((x) != INVALID_PAGE)
109
110 #define PT64_LEVEL_BITS 9
111
112 #define PT64_LEVEL_SHIFT(level) \
113                 ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )
114
115 #define PT64_LEVEL_MASK(level) \
116                 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
117
118 #define PT64_INDEX(address, level)\
119         (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
120
121
122 #define PT32_LEVEL_BITS 10
123
124 #define PT32_LEVEL_SHIFT(level) \
125                 ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )
126
127 #define PT32_LEVEL_MASK(level) \
128                 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
129
130 #define PT32_INDEX(address, level)\
131         (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
132
133
134 #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
135 #define PT64_DIR_BASE_ADDR_MASK \
136         (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
137
138 #define PT32_BASE_ADDR_MASK PAGE_MASK
139 #define PT32_DIR_BASE_ADDR_MASK \
140         (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
141
142
143 #define PFERR_PRESENT_MASK (1U << 0)
144 #define PFERR_WRITE_MASK (1U << 1)
145 #define PFERR_USER_MASK (1U << 2)
146 #define PFERR_FETCH_MASK (1U << 4)
147
148 #define PT64_ROOT_LEVEL 4
149 #define PT32_ROOT_LEVEL 2
150 #define PT32E_ROOT_LEVEL 3
151
152 #define PT_DIRECTORY_LEVEL 2
153 #define PT_PAGE_TABLE_LEVEL 1
154
155 #define RMAP_EXT 4
156
157 struct kvm_rmap_desc {
158         u64 *shadow_ptes[RMAP_EXT];
159         struct kvm_rmap_desc *more;
160 };
161
162 static struct kmem_cache *pte_chain_cache;
163 static struct kmem_cache *rmap_desc_cache;
164 static struct kmem_cache *mmu_page_cache;
165 static struct kmem_cache *mmu_page_header_cache;
166
167 static int is_write_protection(struct kvm_vcpu *vcpu)
168 {
169         return vcpu->cr0 & CR0_WP_MASK;
170 }
171
172 static int is_cpuid_PSE36(void)
173 {
174         return 1;
175 }
176
177 static int is_nx(struct kvm_vcpu *vcpu)
178 {
179         return vcpu->shadow_efer & EFER_NX;
180 }
181
182 static int is_present_pte(unsigned long pte)
183 {
184         return pte & PT_PRESENT_MASK;
185 }
186
187 static int is_writeble_pte(unsigned long pte)
188 {
189         return pte & PT_WRITABLE_MASK;
190 }
191
192 static int is_io_pte(unsigned long pte)
193 {
194         return pte & PT_SHADOW_IO_MARK;
195 }
196
197 static int is_rmap_pte(u64 pte)
198 {
199         return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
200                 == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
201 }
202
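/*
 * Update a shadow pte in place.  set_64bit() is used so that a 64-bit
 * entry is never observed half-written by a concurrent hardware page
 * walk, even on 32-bit hosts.
 */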
203 static void set_shadow_pte(u64 *sptep, u64 spte)
204 {
205 #ifdef CONFIG_X86_64
206         set_64bit((unsigned long *)sptep, spte);
207 #else
208         set_64bit((unsigned long long *)sptep, spte);
209 #endif
210 }
211
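/*
 * Pre-fill a per-vcpu object cache so that later allocations, made while
 * the mmu is walking or modifying shadow page tables, can be served from
 * the cache instead of calling into the allocator at an awkward time.
 */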
212 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
213                                   struct kmem_cache *base_cache, int min,
214                                   gfp_t gfp_flags)
215 {
216         void *obj;
217
218         if (cache->nobjs >= min)
219                 return 0;
220         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
221                 obj = kmem_cache_zalloc(base_cache, gfp_flags);
222                 if (!obj)
223                         return -ENOMEM;
224                 cache->objects[cache->nobjs++] = obj;
225         }
226         return 0;
227 }
228
229 static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
230 {
231         while (mc->nobjs)
232                 kfree(mc->objects[--mc->nobjs]);
233 }
234
235 static int __mmu_topup_memory_caches(struct kvm_vcpu *vcpu, gfp_t gfp_flags)
236 {
237         int r;
238
239         r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
240                                    pte_chain_cache, 4, gfp_flags);
241         if (r)
242                 goto out;
243         r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
244                                    rmap_desc_cache, 1, gfp_flags);
245         if (r)
246                 goto out;
247         r = mmu_topup_memory_cache(&vcpu->mmu_page_cache,
248                                    mmu_page_cache, 4, gfp_flags);
249         if (r)
250                 goto out;
251         r = mmu_topup_memory_cache(&vcpu->mmu_page_header_cache,
252                                    mmu_page_header_cache, 4, gfp_flags);
253 out:
254         return r;
255 }
256
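/*
 * Top up all mmu object caches.  The first attempt uses GFP_NOWAIT under
 * the kvm lock; if that fails, the lock is dropped and the vcpu unloaded
 * so the allocation can be retried with GFP_KERNEL and allowed to sleep.
 */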
257 static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
258 {
259         int r;
260
261         r = __mmu_topup_memory_caches(vcpu, GFP_NOWAIT);
262         if (r < 0) {
263                 spin_unlock(&vcpu->kvm->lock);
264                 kvm_arch_ops->vcpu_put(vcpu);
265                 r = __mmu_topup_memory_caches(vcpu, GFP_KERNEL);
266                 kvm_arch_ops->vcpu_load(vcpu);
267                 spin_lock(&vcpu->kvm->lock);
268         }
269         return r;
270 }
271
272 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
273 {
274         mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
275         mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
276         mmu_free_memory_cache(&vcpu->mmu_page_cache);
277         mmu_free_memory_cache(&vcpu->mmu_page_header_cache);
278 }
279
280 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
281                                     size_t size)
282 {
283         void *p;
284
285         BUG_ON(!mc->nobjs);
286         p = mc->objects[--mc->nobjs];
287         memset(p, 0, size);
288         return p;
289 }
290
291 static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
292 {
293         if (mc->nobjs < KVM_NR_MEM_OBJS)
294                 mc->objects[mc->nobjs++] = obj;
295         else
296                 kfree(obj);
297 }
298
299 static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
300 {
301         return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
302                                       sizeof(struct kvm_pte_chain));
303 }
304
305 static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
306                                struct kvm_pte_chain *pc)
307 {
308         mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
309 }
310
311 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
312 {
313         return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
314                                       sizeof(struct kvm_rmap_desc));
315 }
316
317 static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
318                                struct kvm_rmap_desc *rd)
319 {
320         mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
321 }
322
323 /*
324  * Reverse mapping data structures:
325  *
326  * If page->private bit zero is zero, then page->private points to the
327  * shadow page table entry that points to page_address(page).
328  *
329  * If page->private bit zero is one, (then page->private & ~1) points
330  * to a struct kvm_rmap_desc containing more mappings.
331  */
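/*
 * For illustration: if sptes s1 and s2 both map the same page, rmap_add()
 * first stores page->private = (unsigned long)s1; adding s2 allocates a
 * kvm_rmap_desc desc with shadow_ptes = { s1, s2, NULL, NULL } and stores
 * page->private = (unsigned long)desc | 1.
 */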
332 static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
333 {
334         struct page *page;
335         struct kvm_rmap_desc *desc;
336         int i;
337
338         if (!is_rmap_pte(*spte))
339                 return;
340         page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
341         if (!page_private(page)) {
342                 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
343                 set_page_private(page,(unsigned long)spte);
344         } else if (!(page_private(page) & 1)) {
345                 rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
346                 desc = mmu_alloc_rmap_desc(vcpu);
347                 desc->shadow_ptes[0] = (u64 *)page_private(page);
348                 desc->shadow_ptes[1] = spte;
349                 set_page_private(page,(unsigned long)desc | 1);
350         } else {
351                 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
352                 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
353                 while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
354                         desc = desc->more;
355                 if (desc->shadow_ptes[RMAP_EXT-1]) {
356                         desc->more = mmu_alloc_rmap_desc(vcpu);
357                         desc = desc->more;
358                 }
359                 for (i = 0; desc->shadow_ptes[i]; ++i)
360                         ;
361                 desc->shadow_ptes[i] = spte;
362         }
363 }
364
365 static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
366                                    struct page *page,
367                                    struct kvm_rmap_desc *desc,
368                                    int i,
369                                    struct kvm_rmap_desc *prev_desc)
370 {
371         int j;
372
373         for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
374                 ;
375         desc->shadow_ptes[i] = desc->shadow_ptes[j];
376         desc->shadow_ptes[j] = NULL;
377         if (j != 0)
378                 return;
379         if (!prev_desc && !desc->more)
380                 set_page_private(page,(unsigned long)desc->shadow_ptes[0]);
381         else
382                 if (prev_desc)
383                         prev_desc->more = desc->more;
384                 else
385                         set_page_private(page,(unsigned long)desc->more | 1);
386         mmu_free_rmap_desc(vcpu, desc);
387 }
388
389 static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
390 {
391         struct page *page;
392         struct kvm_rmap_desc *desc;
393         struct kvm_rmap_desc *prev_desc;
394         int i;
395
396         if (!is_rmap_pte(*spte))
397                 return;
398         page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
399         if (!page_private(page)) {
400                 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
401                 BUG();
402         } else if (!(page_private(page) & 1)) {
403                 rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
404                 if ((u64 *)page_private(page) != spte) {
405                         printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
406                                spte, *spte);
407                         BUG();
408                 }
409                 set_page_private(page,0);
410         } else {
411                 rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
412                 desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
413                 prev_desc = NULL;
414                 while (desc) {
415                         for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
416                                 if (desc->shadow_ptes[i] == spte) {
417                                         rmap_desc_remove_entry(vcpu, page,
418                                                                desc, i,
419                                                                prev_desc);
420                                         return;
421                                 }
422                         prev_desc = desc;
423                         desc = desc->more;
424                 }
425                 BUG();
426         }
427 }
428
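/*
 * Remove write access from every shadow pte that maps the given gfn.
 * Each writable spte is dropped from the rmap, the TLB is flushed, and
 * the entry is rewritten without PT_WRITABLE_MASK.
 */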
429 static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
430 {
431         struct kvm *kvm = vcpu->kvm;
432         struct page *page;
433         struct kvm_rmap_desc *desc;
434         u64 *spte;
435
436         page = gfn_to_page(kvm, gfn);
437         BUG_ON(!page);
438
439         while (page_private(page)) {
440                 if (!(page_private(page) & 1))
441                         spte = (u64 *)page_private(page);
442                 else {
443                         desc = (struct kvm_rmap_desc *)(page_private(page) & ~1ul);
444                         spte = desc->shadow_ptes[0];
445                 }
446                 BUG_ON(!spte);
447                 BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT
448                        != page_to_pfn(page));
449                 BUG_ON(!(*spte & PT_PRESENT_MASK));
450                 BUG_ON(!(*spte & PT_WRITABLE_MASK));
451                 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
452                 rmap_remove(vcpu, spte);
453                 kvm_arch_ops->tlb_flush(vcpu);
454                 set_shadow_pte(spte, *spte & ~PT_WRITABLE_MASK);
455         }
456 }
457
458 #ifdef MMU_DEBUG
459 static int is_empty_shadow_page(u64 *spt)
460 {
461         u64 *pos;
462         u64 *end;
463
464         for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
465                 if (*pos != 0) {
466                         printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
467                                pos, *pos);
468                         return 0;
469                 }
470         return 1;
471 }
472 #endif
473
474 static void kvm_mmu_free_page(struct kvm_vcpu *vcpu,
475                               struct kvm_mmu_page *page_head)
476 {
477         ASSERT(is_empty_shadow_page(page_head->spt));
478         list_del(&page_head->link);
479         mmu_memory_cache_free(&vcpu->mmu_page_cache, page_head->spt);
480         mmu_memory_cache_free(&vcpu->mmu_page_header_cache, page_head);
481         ++vcpu->kvm->n_free_mmu_pages;
482 }
483
484 static unsigned kvm_page_table_hashfn(gfn_t gfn)
485 {
486         return gfn;
487 }
488
489 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
490                                                u64 *parent_pte)
491 {
492         struct kvm_mmu_page *page;
493
494         if (!vcpu->kvm->n_free_mmu_pages)
495                 return NULL;
496
497         page = mmu_memory_cache_alloc(&vcpu->mmu_page_header_cache,
498                                       sizeof *page);
499         page->spt = mmu_memory_cache_alloc(&vcpu->mmu_page_cache, PAGE_SIZE);
500         set_page_private(virt_to_page(page->spt), (unsigned long)page);
501         list_add(&page->link, &vcpu->kvm->active_mmu_pages);
502         ASSERT(is_empty_shadow_page(page->spt));
503         page->slot_bitmap = 0;
504         page->multimapped = 0;
505         page->parent_pte = parent_pte;
506         --vcpu->kvm->n_free_mmu_pages;
507         return page;
508 }
509
510 static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
511                                     struct kvm_mmu_page *page, u64 *parent_pte)
512 {
513         struct kvm_pte_chain *pte_chain;
514         struct hlist_node *node;
515         int i;
516
517         if (!parent_pte)
518                 return;
519         if (!page->multimapped) {
520                 u64 *old = page->parent_pte;
521
522                 if (!old) {
523                         page->parent_pte = parent_pte;
524                         return;
525                 }
526                 page->multimapped = 1;
527                 pte_chain = mmu_alloc_pte_chain(vcpu);
528                 INIT_HLIST_HEAD(&page->parent_ptes);
529                 hlist_add_head(&pte_chain->link, &page->parent_ptes);
530                 pte_chain->parent_ptes[0] = old;
531         }
532         hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
533                 if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
534                         continue;
535                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
536                         if (!pte_chain->parent_ptes[i]) {
537                                 pte_chain->parent_ptes[i] = parent_pte;
538                                 return;
539                         }
540         }
541         pte_chain = mmu_alloc_pte_chain(vcpu);
542         BUG_ON(!pte_chain);
543         hlist_add_head(&pte_chain->link, &page->parent_ptes);
544         pte_chain->parent_ptes[0] = parent_pte;
545 }
546
547 static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
548                                        struct kvm_mmu_page *page,
549                                        u64 *parent_pte)
550 {
551         struct kvm_pte_chain *pte_chain;
552         struct hlist_node *node;
553         int i;
554
555         if (!page->multimapped) {
556                 BUG_ON(page->parent_pte != parent_pte);
557                 page->parent_pte = NULL;
558                 return;
559         }
560         hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
561                 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
562                         if (!pte_chain->parent_ptes[i])
563                                 break;
564                         if (pte_chain->parent_ptes[i] != parent_pte)
565                                 continue;
566                         while (i + 1 < NR_PTE_CHAIN_ENTRIES
567                                 && pte_chain->parent_ptes[i + 1]) {
568                                 pte_chain->parent_ptes[i]
569                                         = pte_chain->parent_ptes[i + 1];
570                                 ++i;
571                         }
572                         pte_chain->parent_ptes[i] = NULL;
573                         if (i == 0) {
574                                 hlist_del(&pte_chain->link);
575                                 mmu_free_pte_chain(vcpu, pte_chain);
576                                 if (hlist_empty(&page->parent_ptes)) {
577                                         page->multimapped = 0;
578                                         page->parent_pte = NULL;
579                                 }
580                         }
581                         return;
582                 }
583         BUG();
584 }
585
586 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
587                                                 gfn_t gfn)
588 {
589         unsigned index;
590         struct hlist_head *bucket;
591         struct kvm_mmu_page *page;
592         struct hlist_node *node;
593
594         pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
595         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
596         bucket = &vcpu->kvm->mmu_page_hash[index];
597         hlist_for_each_entry(page, node, bucket, hash_link)
598                 if (page->gfn == gfn && !page->role.metaphysical) {
599                         pgprintk("%s: found role %x\n",
600                                  __FUNCTION__, page->role.word);
601                         return page;
602                 }
603         return NULL;
604 }
605
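/*
 * Find a shadow page for (gfn, role), or allocate a fresh one if none
 * exists.  For 32-bit guests the role also encodes a quadrant, since a
 * 1024-entry guest page table is shadowed by more than one 512-entry
 * shadow page.  Newly created non-metaphysical pages get the underlying
 * guest page write-protected so that guest pte updates are intercepted.
 */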
606 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
607                                              gfn_t gfn,
608                                              gva_t gaddr,
609                                              unsigned level,
610                                              int metaphysical,
611                                              unsigned hugepage_access,
612                                              u64 *parent_pte)
613 {
614         union kvm_mmu_page_role role;
615         unsigned index;
616         unsigned quadrant;
617         struct hlist_head *bucket;
618         struct kvm_mmu_page *page;
619         struct hlist_node *node;
620
621         role.word = 0;
622         role.glevels = vcpu->mmu.root_level;
623         role.level = level;
624         role.metaphysical = metaphysical;
625         role.hugepage_access = hugepage_access;
626         if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
627                 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
628                 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
629                 role.quadrant = quadrant;
630         }
631         pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
632                  gfn, role.word);
633         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
634         bucket = &vcpu->kvm->mmu_page_hash[index];
635         hlist_for_each_entry(page, node, bucket, hash_link)
636                 if (page->gfn == gfn && page->role.word == role.word) {
637                         mmu_page_add_parent_pte(vcpu, page, parent_pte);
638                         pgprintk("%s: found\n", __FUNCTION__);
639                         return page;
640                 }
641         page = kvm_mmu_alloc_page(vcpu, parent_pte);
642         if (!page)
643                 return page;
644         pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
645         page->gfn = gfn;
646         page->role = role;
647         hlist_add_head(&page->hash_link, bucket);
648         if (!metaphysical)
649                 rmap_write_protect(vcpu, gfn);
650         return page;
651 }
652
653 static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
654                                          struct kvm_mmu_page *page)
655 {
656         unsigned i;
657         u64 *pt;
658         u64 ent;
659
660         pt = page->spt;
661
662         if (page->role.level == PT_PAGE_TABLE_LEVEL) {
663                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
664                         if (pt[i] & PT_PRESENT_MASK)
665                                 rmap_remove(vcpu, &pt[i]);
666                         pt[i] = 0;
667                 }
668                 kvm_arch_ops->tlb_flush(vcpu);
669                 return;
670         }
671
672         for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
673                 ent = pt[i];
674
675                 pt[i] = 0;
676                 if (!(ent & PT_PRESENT_MASK))
677                         continue;
678                 ent &= PT64_BASE_ADDR_MASK;
679                 mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
680         }
681 }
682
683 static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
684                              struct kvm_mmu_page *page,
685                              u64 *parent_pte)
686 {
687         mmu_page_remove_parent_pte(vcpu, page, parent_pte);
688 }
689
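/*
 * Tear down a shadow page: detach it from every parent pte that points
 * to it, unlink its children, and either free it or, if it is still in
 * use as a root, keep it on the active list until the root count drops.
 */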
690 static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
691                              struct kvm_mmu_page *page)
692 {
693         u64 *parent_pte;
694
695         while (page->multimapped || page->parent_pte) {
696                 if (!page->multimapped)
697                         parent_pte = page->parent_pte;
698                 else {
699                         struct kvm_pte_chain *chain;
700
701                         chain = container_of(page->parent_ptes.first,
702                                              struct kvm_pte_chain, link);
703                         parent_pte = chain->parent_ptes[0];
704                 }
705                 BUG_ON(!parent_pte);
706                 kvm_mmu_put_page(vcpu, page, parent_pte);
707                 set_shadow_pte(parent_pte, 0);
708         }
709         kvm_mmu_page_unlink_children(vcpu, page);
710         if (!page->root_count) {
711                 hlist_del(&page->hash_link);
712                 kvm_mmu_free_page(vcpu, page);
713         } else
714                 list_move(&page->link, &vcpu->kvm->active_mmu_pages);
715 }
716
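/*
 * Zap any non-metaphysical shadow pages built for the given gfn, so the
 * frame is no longer treated as a guest page table.  Returns nonzero if
 * something was zapped.
 */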
717 static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
718 {
719         unsigned index;
720         struct hlist_head *bucket;
721         struct kvm_mmu_page *page;
722         struct hlist_node *node, *n;
723         int r;
724
725         pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
726         r = 0;
727         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
728         bucket = &vcpu->kvm->mmu_page_hash[index];
729         hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
730                 if (page->gfn == gfn && !page->role.metaphysical) {
731                         pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
732                                  page->role.word);
733                         kvm_mmu_zap_page(vcpu, page);
734                         r = 1;
735                 }
736         return r;
737 }
738
739 static void mmu_unshadow(struct kvm_vcpu *vcpu, gfn_t gfn)
740 {
741         struct kvm_mmu_page *page;
742
743         while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
744                 pgprintk("%s: zap %lx %x\n",
745                          __FUNCTION__, gfn, page->role.word);
746                 kvm_mmu_zap_page(vcpu, page);
747         }
748 }
749
750 static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
751 {
752         int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
753         struct kvm_mmu_page *page_head = page_header(__pa(pte));
754
755         __set_bit(slot, &page_head->slot_bitmap);
756 }
757
758 hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
759 {
760         hpa_t hpa = gpa_to_hpa(vcpu, gpa);
761
762         return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa;
763 }
764
765 hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
766 {
767         struct page *page;
768
769         ASSERT((gpa & HPA_ERR_MASK) == 0);
770         page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
771         if (!page)
772                 return gpa | HPA_ERR_MASK;
773         return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
774                 | (gpa & (PAGE_SIZE-1));
775 }
776
777 hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
778 {
779         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
780
781         if (gpa == UNMAPPED_GVA)
782                 return UNMAPPED_GVA;
783         return gpa_to_hpa(vcpu, gpa);
784 }
785
786 struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
787 {
788         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
789
790         if (gpa == UNMAPPED_GVA)
791                 return NULL;
792         return pfn_to_page(gpa_to_hpa(vcpu, gpa) >> PAGE_SHIFT);
793 }
794
795 static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
796 {
797 }
798
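/*
 * Install a 1:1 gva->gpa mapping for a guest running without paging.
 * The shadow table is walked from the root; missing intermediate levels
 * are filled in with metaphysical pages and the final pte maps the host
 * page with present/writable/user permissions.
 */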
799 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
800 {
801         int level = PT32E_ROOT_LEVEL;
802         hpa_t table_addr = vcpu->mmu.root_hpa;
803
804         for (; ; level--) {
805                 u32 index = PT64_INDEX(v, level);
806                 u64 *table;
807                 u64 pte;
808
809                 ASSERT(VALID_PAGE(table_addr));
810                 table = __va(table_addr);
811
812                 if (level == 1) {
813                         pte = table[index];
814                         if (is_present_pte(pte) && is_writeble_pte(pte))
815                                 return 0;
816                         mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
817                         page_header_update_slot(vcpu->kvm, table, v);
818                         table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
819                                                                 PT_USER_MASK;
820                         rmap_add(vcpu, &table[index]);
821                         return 0;
822                 }
823
824                 if (table[index] == 0) {
825                         struct kvm_mmu_page *new_table;
826                         gfn_t pseudo_gfn;
827
828                         pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
829                                 >> PAGE_SHIFT;
830                         new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
831                                                      v, level - 1,
832                                                      1, 0, &table[index]);
833                         if (!new_table) {
834                                 pgprintk("nonpaging_map: ENOMEM\n");
835                                 return -ENOMEM;
836                         }
837
838                         table[index] = __pa(new_table->spt) | PT_PRESENT_MASK
839                                 | PT_WRITABLE_MASK | PT_USER_MASK;
840                 }
841                 table_addr = table[index] & PT64_BASE_ADDR_MASK;
842         }
843 }
844
845 static void mmu_free_roots(struct kvm_vcpu *vcpu)
846 {
847         int i;
848         struct kvm_mmu_page *page;
849
850 #ifdef CONFIG_X86_64
851         if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
852                 hpa_t root = vcpu->mmu.root_hpa;
853
854                 ASSERT(VALID_PAGE(root));
855                 page = page_header(root);
856                 --page->root_count;
857                 vcpu->mmu.root_hpa = INVALID_PAGE;
858                 return;
859         }
860 #endif
861         for (i = 0; i < 4; ++i) {
862                 hpa_t root = vcpu->mmu.pae_root[i];
863
864                 if (root) {
865                         ASSERT(VALID_PAGE(root));
866                         root &= PT64_BASE_ADDR_MASK;
867                         page = page_header(root);
868                         --page->root_count;
869                 }
870                 vcpu->mmu.pae_root[i] = INVALID_PAGE;
871         }
872         vcpu->mmu.root_hpa = INVALID_PAGE;
873 }
874
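/*
 * Allocate shadow roots for the current paging mode: a single 4-level
 * root when the shadow uses PT64_ROOT_LEVEL, otherwise the four pae_root
 * entries (shadowing the guest PDPTEs, or metaphysical roots when the
 * guest has paging disabled).
 */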
875 static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
876 {
877         int i;
878         gfn_t root_gfn;
879         struct kvm_mmu_page *page;
880
881         root_gfn = vcpu->cr3 >> PAGE_SHIFT;
882
883 #ifdef CONFIG_X86_64
884         if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
885                 hpa_t root = vcpu->mmu.root_hpa;
886
887                 ASSERT(!VALID_PAGE(root));
888                 page = kvm_mmu_get_page(vcpu, root_gfn, 0,
889                                         PT64_ROOT_LEVEL, 0, 0, NULL);
890                 root = __pa(page->spt);
891                 ++page->root_count;
892                 vcpu->mmu.root_hpa = root;
893                 return;
894         }
895 #endif
896         for (i = 0; i < 4; ++i) {
897                 hpa_t root = vcpu->mmu.pae_root[i];
898
899                 ASSERT(!VALID_PAGE(root));
900                 if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) {
901                         if (!is_present_pte(vcpu->pdptrs[i])) {
902                                 vcpu->mmu.pae_root[i] = 0;
903                                 continue;
904                         }
905                         root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
906                 } else if (vcpu->mmu.root_level == 0)
907                         root_gfn = 0;
908                 page = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
909                                         PT32_ROOT_LEVEL, !is_paging(vcpu),
910                                         0, NULL);
911                 root = __pa(page->spt);
912                 ++page->root_count;
913                 vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
914         }
915         vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
916 }
917
918 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
919 {
920         return vaddr;
921 }
922
923 static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
924                                u32 error_code)
925 {
926         gpa_t addr = gva;
927         hpa_t paddr;
928         int r;
929
930         r = mmu_topup_memory_caches(vcpu);
931         if (r)
932                 return r;
933
934         ASSERT(vcpu);
935         ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
936
937
938         paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
939
940         if (is_error_hpa(paddr))
941                 return 1;
942
943         return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
944 }
945
946 static void nonpaging_free(struct kvm_vcpu *vcpu)
947 {
948         mmu_free_roots(vcpu);
949 }
950
951 static int nonpaging_init_context(struct kvm_vcpu *vcpu)
952 {
953         struct kvm_mmu *context = &vcpu->mmu;
954
955         context->new_cr3 = nonpaging_new_cr3;
956         context->page_fault = nonpaging_page_fault;
957         context->gva_to_gpa = nonpaging_gva_to_gpa;
958         context->free = nonpaging_free;
959         context->root_level = 0;
960         context->shadow_root_level = PT32E_ROOT_LEVEL;
961         mmu_alloc_roots(vcpu);
962         ASSERT(VALID_PAGE(context->root_hpa));
963         kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
964         return 0;
965 }
966
967 static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
968 {
969         ++vcpu->stat.tlb_flush;
970         kvm_arch_ops->tlb_flush(vcpu);
971 }
972
973 static void paging_new_cr3(struct kvm_vcpu *vcpu)
974 {
975         pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
976         mmu_free_roots(vcpu);
977         if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
978                 kvm_mmu_free_some_pages(vcpu);
979         mmu_alloc_roots(vcpu);
980         kvm_mmu_flush_tlb(vcpu);
981         kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
982 }
983
984 static void inject_page_fault(struct kvm_vcpu *vcpu,
985                               u64 addr,
986                               u32 err_code)
987 {
988         kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
989 }
990
991 static void paging_free(struct kvm_vcpu *vcpu)
992 {
993         nonpaging_free(vcpu);
994 }
995
996 #define PTTYPE 64
997 #include "paging_tmpl.h"
998 #undef PTTYPE
999
1000 #define PTTYPE 32
1001 #include "paging_tmpl.h"
1002 #undef PTTYPE
1003
1004 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1005 {
1006         struct kvm_mmu *context = &vcpu->mmu;
1007
1008         ASSERT(is_pae(vcpu));
1009         context->new_cr3 = paging_new_cr3;
1010         context->page_fault = paging64_page_fault;
1011         context->gva_to_gpa = paging64_gva_to_gpa;
1012         context->free = paging_free;
1013         context->root_level = level;
1014         context->shadow_root_level = level;
1015         mmu_alloc_roots(vcpu);
1016         ASSERT(VALID_PAGE(context->root_hpa));
1017         kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1018                     (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1019         return 0;
1020 }
1021
1022 static int paging64_init_context(struct kvm_vcpu *vcpu)
1023 {
1024         return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
1025 }
1026
1027 static int paging32_init_context(struct kvm_vcpu *vcpu)
1028 {
1029         struct kvm_mmu *context = &vcpu->mmu;
1030
1031         context->new_cr3 = paging_new_cr3;
1032         context->page_fault = paging32_page_fault;
1033         context->gva_to_gpa = paging32_gva_to_gpa;
1034         context->free = paging_free;
1035         context->root_level = PT32_ROOT_LEVEL;
1036         context->shadow_root_level = PT32E_ROOT_LEVEL;
1037         mmu_alloc_roots(vcpu);
1038         ASSERT(VALID_PAGE(context->root_hpa));
1039         kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
1040                     (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
1041         return 0;
1042 }
1043
1044 static int paging32E_init_context(struct kvm_vcpu *vcpu)
1045 {
1046         return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
1047 }
1048
1049 static int init_kvm_mmu(struct kvm_vcpu *vcpu)
1050 {
1051         ASSERT(vcpu);
1052         ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1053
1054         mmu_topup_memory_caches(vcpu);
1055         if (!is_paging(vcpu))
1056                 return nonpaging_init_context(vcpu);
1057         else if (is_long_mode(vcpu))
1058                 return paging64_init_context(vcpu);
1059         else if (is_pae(vcpu))
1060                 return paging32E_init_context(vcpu);
1061         else
1062                 return paging32_init_context(vcpu);
1063 }
1064
1065 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
1066 {
1067         ASSERT(vcpu);
1068         if (VALID_PAGE(vcpu->mmu.root_hpa)) {
1069                 vcpu->mmu.free(vcpu);
1070                 vcpu->mmu.root_hpa = INVALID_PAGE;
1071         }
1072 }
1073
1074 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
1075 {
1076         int r;
1077
1078         destroy_kvm_mmu(vcpu);
1079         r = init_kvm_mmu(vcpu);
1080         if (r < 0)
1081                 goto out;
1082         r = mmu_topup_memory_caches(vcpu);
1083 out:
1084         return r;
1085 }
1086
1087 static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
1088                                   struct kvm_mmu_page *page,
1089                                   u64 *spte)
1090 {
1091         u64 pte;
1092         struct kvm_mmu_page *child;
1093
1094         pte = *spte;
1095         if (is_present_pte(pte)) {
1096                 if (page->role.level == PT_PAGE_TABLE_LEVEL)
1097                         rmap_remove(vcpu, spte);
1098                 else {
1099                         child = page_header(pte & PT64_BASE_ADDR_MASK);
1100                         mmu_page_remove_parent_pte(vcpu, child, spte);
1101                 }
1102         }
1103         *spte = 0;
1104 }
1105
1106 static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
1107                                   struct kvm_mmu_page *page,
1108                                   u64 *spte,
1109                                   const void *new, int bytes)
1110 {
1111         if (page->role.level != PT_PAGE_TABLE_LEVEL)
1112                 return;
1113
1114         if (page->role.glevels == PT32_ROOT_LEVEL)
1115                 paging32_update_pte(vcpu, page, spte, new, bytes);
1116         else
1117                 paging64_update_pte(vcpu, page, spte, new, bytes);
1118 }
1119
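/*
 * Called when the guest writes to a page that is shadowed as a page
 * table.  Misaligned or flooded writes cause the shadow page to be
 * zapped; otherwise the affected shadow pte(s) are invalidated and
 * rebuilt from the newly written guest pte.
 */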
1120 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1121                        const u8 *old, const u8 *new, int bytes)
1122 {
1123         gfn_t gfn = gpa >> PAGE_SHIFT;
1124         struct kvm_mmu_page *page;
1125         struct hlist_node *node, *n;
1126         struct hlist_head *bucket;
1127         unsigned index;
1128         u64 *spte;
1129         unsigned offset = offset_in_page(gpa);
1130         unsigned pte_size;
1131         unsigned page_offset;
1132         unsigned misaligned;
1133         unsigned quadrant;
1134         int level;
1135         int flooded = 0;
1136         int npte;
1137
1138         pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
1139         if (gfn == vcpu->last_pt_write_gfn) {
1140                 ++vcpu->last_pt_write_count;
1141                 if (vcpu->last_pt_write_count >= 3)
1142                         flooded = 1;
1143         } else {
1144                 vcpu->last_pt_write_gfn = gfn;
1145                 vcpu->last_pt_write_count = 1;
1146         }
1147         index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
1148         bucket = &vcpu->kvm->mmu_page_hash[index];
1149         hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
1150                 if (page->gfn != gfn || page->role.metaphysical)
1151                         continue;
1152                 pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1153                 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
1154                 misaligned |= bytes < 4;
1155                 if (misaligned || flooded) {
1156                         /*
1157                          * Misaligned accesses are too much trouble to fix
1158                          * up; also, they usually indicate a page is not used
1159                          * as a page table.
1160                          *
1161                          * If we're seeing too many writes to a page,
1162                          * it may no longer be a page table, or we may be
1163                          * forking, in which case it is better to unmap the
1164                          * page.
1165                          */
1166                         pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1167                                  gpa, bytes, page->role.word);
1168                         kvm_mmu_zap_page(vcpu, page);
1169                         continue;
1170                 }
1171                 page_offset = offset;
1172                 level = page->role.level;
1173                 npte = 1;
1174                 if (page->role.glevels == PT32_ROOT_LEVEL) {
1175                         page_offset <<= 1;      /* 32->64 */
1176                         /*
1177                          * A 32-bit pde maps 4MB while the shadow pdes map
1178                          * only 2MB.  So we need to double the offset again
1179                          * and zap two pdes instead of one.
1180                          */
1181                         if (level == PT32_ROOT_LEVEL) {
1182                                 page_offset &= ~7; /* kill rounding error */
1183                                 page_offset <<= 1;
1184                                 npte = 2;
1185                         }
1186                         quadrant = page_offset >> PAGE_SHIFT;
1187                         page_offset &= ~PAGE_MASK;
1188                         if (quadrant != page->role.quadrant)
1189                                 continue;
1190                 }
1191                 spte = &page->spt[page_offset / sizeof(*spte)];
1192                 while (npte--) {
1193                         mmu_pte_write_zap_pte(vcpu, page, spte);
1194                         mmu_pte_write_new_pte(vcpu, page, spte, new, bytes);
1195                         ++spte;
1196                 }
1197         }
1198 }
1199
1200 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1201 {
1202         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
1203
1204         return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
1205 }
1206
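/*
 * Recycle shadow pages until the free page count reaches
 * KVM_REFILL_PAGES, zapping pages from the tail of the active list
 * (effectively the oldest ones) first.
 */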
1207 void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1208 {
1209         while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
1210                 struct kvm_mmu_page *page;
1211
1212                 page = container_of(vcpu->kvm->active_mmu_pages.prev,
1213                                     struct kvm_mmu_page, link);
1214                 kvm_mmu_zap_page(vcpu, page);
1215         }
1216 }
1217 EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
1218
1219 static void free_mmu_pages(struct kvm_vcpu *vcpu)
1220 {
1221         struct kvm_mmu_page *page;
1222
1223         while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1224                 page = container_of(vcpu->kvm->active_mmu_pages.next,
1225                                     struct kvm_mmu_page, link);
1226                 kvm_mmu_zap_page(vcpu, page);
1227         }
1228         free_page((unsigned long)vcpu->mmu.pae_root);
1229 }
1230
1231 static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
1232 {
1233         struct page *page;
1234         int i;
1235
1236         ASSERT(vcpu);
1237
1238         vcpu->kvm->n_free_mmu_pages = KVM_NUM_MMU_PAGES;
1239
1240         /*
1241          * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
1242          * Therefore we need to allocate shadow page tables in the first
1243          * 4GB of memory, which happens to fit the DMA32 zone.
1244          */
1245         page = alloc_page(GFP_KERNEL | __GFP_DMA32);
1246         if (!page)
1247                 goto error_1;
1248         vcpu->mmu.pae_root = page_address(page);
1249         for (i = 0; i < 4; ++i)
1250                 vcpu->mmu.pae_root[i] = INVALID_PAGE;
1251
1252         return 0;
1253
1254 error_1:
1255         free_mmu_pages(vcpu);
1256         return -ENOMEM;
1257 }
1258
1259 int kvm_mmu_create(struct kvm_vcpu *vcpu)
1260 {
1261         ASSERT(vcpu);
1262         ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1263
1264         return alloc_mmu_pages(vcpu);
1265 }
1266
1267 int kvm_mmu_setup(struct kvm_vcpu *vcpu)
1268 {
1269         ASSERT(vcpu);
1270         ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
1271
1272         return init_kvm_mmu(vcpu);
1273 }
1274
1275 void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
1276 {
1277         ASSERT(vcpu);
1278
1279         destroy_kvm_mmu(vcpu);
1280         free_mmu_pages(vcpu);
1281         mmu_free_memory_caches(vcpu);
1282 }
1283
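/*
 * Strip write access from every shadow pte belonging to the given memory
 * slot, e.g. when dirty page logging is enabled, so that subsequent
 * guest writes fault and can be recorded.
 */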
1284 void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
1285 {
1286         struct kvm *kvm = vcpu->kvm;
1287         struct kvm_mmu_page *page;
1288
1289         list_for_each_entry(page, &kvm->active_mmu_pages, link) {
1290                 int i;
1291                 u64 *pt;
1292
1293                 if (!test_bit(slot, &page->slot_bitmap))
1294                         continue;
1295
1296                 pt = page->spt;
1297                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
1298                         /* avoid RMW */
1299                         if (pt[i] & PT_WRITABLE_MASK) {
1300                                 rmap_remove(vcpu, &pt[i]);
1301                                 pt[i] &= ~PT_WRITABLE_MASK;
1302                         }
1303         }
1304 }
1305
1306 void kvm_mmu_zap_all(struct kvm_vcpu *vcpu)
1307 {
1308         destroy_kvm_mmu(vcpu);
1309
1310         while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
1311                 struct kvm_mmu_page *page;
1312
1313                 page = container_of(vcpu->kvm->active_mmu_pages.next,
1314                                     struct kvm_mmu_page, link);
1315                 kvm_mmu_zap_page(vcpu, page);
1316         }
1317
1318         mmu_free_memory_caches(vcpu);
1319         kvm_arch_ops->tlb_flush(vcpu);
1320         init_kvm_mmu(vcpu);
1321 }
1322
1323 void kvm_mmu_module_exit(void)
1324 {
1325         if (pte_chain_cache)
1326                 kmem_cache_destroy(pte_chain_cache);
1327         if (rmap_desc_cache)
1328                 kmem_cache_destroy(rmap_desc_cache);
1329         if (mmu_page_cache)
1330                 kmem_cache_destroy(mmu_page_cache);
1331         if (mmu_page_header_cache)
1332                 kmem_cache_destroy(mmu_page_header_cache);
1333 }
1334
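/*
 * Create the slab caches backing the per-vcpu mmu object caches.  The
 * shadow page table cache is both page-sized and page-aligned, since
 * shadow ptes reference these pages by physical address.
 */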
1335 int kvm_mmu_module_init(void)
1336 {
1337         pte_chain_cache = kmem_cache_create("kvm_pte_chain",
1338                                             sizeof(struct kvm_pte_chain),
1339                                             0, 0, NULL, NULL);
1340         if (!pte_chain_cache)
1341                 goto nomem;
1342         rmap_desc_cache = kmem_cache_create("kvm_rmap_desc",
1343                                             sizeof(struct kvm_rmap_desc),
1344                                             0, 0, NULL, NULL);
1345         if (!rmap_desc_cache)
1346                 goto nomem;
1347
1348         mmu_page_cache = kmem_cache_create("kvm_mmu_page",
1349                                            PAGE_SIZE,
1350                                            PAGE_SIZE, 0, NULL, NULL);
1351         if (!mmu_page_cache)
1352                 goto nomem;
1353
1354         mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
1355                                                   sizeof(struct kvm_mmu_page),
1356                                                   0, 0, NULL, NULL);
1357         if (!mmu_page_header_cache)
1358                 goto nomem;
1359
1360         return 0;
1361
1362 nomem:
1363         kvm_mmu_module_exit();
1364         return -ENOMEM;
1365 }
1366
1367 #ifdef AUDIT
1368
1369 static const char *audit_msg;
1370
1371 static gva_t canonicalize(gva_t gva)
1372 {
1373 #ifdef CONFIG_X86_64
1374         gva = (long long)(gva << 16) >> 16;
1375 #endif
1376         return gva;
1377 }
1378
1379 static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
1380                                 gva_t va, int level)
1381 {
1382         u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
1383         int i;
1384         gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
1385
1386         for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
1387                 u64 ent = pt[i];
1388
1389                 if (!(ent & PT_PRESENT_MASK))
1390                         continue;
1391
1392                 va = canonicalize(va);
1393                 if (level > 1)
1394                         audit_mappings_page(vcpu, ent, va, level - 1);
1395                 else {
1396                         gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
1397                         hpa_t hpa = gpa_to_hpa(vcpu, gpa);
1398
1399                         if ((ent & PT_PRESENT_MASK)
1400                             && (ent & PT64_BASE_ADDR_MASK) != hpa)
1401                                 printk(KERN_ERR "audit error: (%s) levels %d"
1402                                        " gva %lx gpa %llx hpa %llx ent %llx\n",
1403                                        audit_msg, vcpu->mmu.root_level,
1404                                        va, gpa, hpa, ent);
1405                 }
1406         }
1407 }
1408
1409 static void audit_mappings(struct kvm_vcpu *vcpu)
1410 {
1411         unsigned i;
1412
1413         if (vcpu->mmu.root_level == 4)
1414                 audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
1415         else
1416                 for (i = 0; i < 4; ++i)
1417                         if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
1418                                 audit_mappings_page(vcpu,
1419                                                     vcpu->mmu.pae_root[i],
1420                                                     i << 30,
1421                                                     2);
1422 }
1423
1424 static int count_rmaps(struct kvm_vcpu *vcpu)
1425 {
1426         int nmaps = 0;
1427         int i, j, k;
1428
1429         for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
1430                 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
1431                 struct kvm_rmap_desc *d;
1432
1433                 for (j = 0; j < m->npages; ++j) {
1434                         struct page *page = m->phys_mem[j];
1435
1436                         if (!page->private)
1437                                 continue;
1438                         if (!(page->private & 1)) {
1439                                 ++nmaps;
1440                                 continue;
1441                         }
1442                         d = (struct kvm_rmap_desc *)(page->private & ~1ul);
1443                         while (d) {
1444                                 for (k = 0; k < RMAP_EXT; ++k)
1445                                         if (d->shadow_ptes[k])
1446                                                 ++nmaps;
1447                                         else
1448                                                 break;
1449                                 d = d->more;
1450                         }
1451                 }
1452         }
1453         return nmaps;
1454 }
1455
1456 static int count_writable_mappings(struct kvm_vcpu *vcpu)
1457 {
1458         int nmaps = 0;
1459         struct kvm_mmu_page *page;
1460         int i;
1461
1462         list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1463                 u64 *pt = page->spt;
1464
1465                 if (page->role.level != PT_PAGE_TABLE_LEVEL)
1466                         continue;
1467
1468                 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1469                         u64 ent = pt[i];
1470
1471                         if (!(ent & PT_PRESENT_MASK))
1472                                 continue;
1473                         if (!(ent & PT_WRITABLE_MASK))
1474                                 continue;
1475                         ++nmaps;
1476                 }
1477         }
1478         return nmaps;
1479 }
1480
1481 static void audit_rmap(struct kvm_vcpu *vcpu)
1482 {
1483         int n_rmap = count_rmaps(vcpu);
1484         int n_actual = count_writable_mappings(vcpu);
1485
1486         if (n_rmap != n_actual)
1487                 printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
1488                        __FUNCTION__, audit_msg, n_rmap, n_actual);
1489 }
1490
1491 static void audit_write_protection(struct kvm_vcpu *vcpu)
1492 {
1493         struct kvm_mmu_page *page;
1494
1495         list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
1496                 hfn_t hfn;
1497                 struct page *pg;
1498
1499                 if (page->role.metaphysical)
1500                         continue;
1501
1502                 hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
1503                         >> PAGE_SHIFT;
1504                 pg = pfn_to_page(hfn);
1505                 if (pg->private)
1506                         printk(KERN_ERR "%s: (%s) shadow page has writable"
1507                                " mappings: gfn %lx role %x\n",
1508                                __FUNCTION__, audit_msg, page->gfn,
1509                                page->role.word);
1510         }
1511 }
1512
1513 static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
1514 {
1515         int olddbg = dbg;
1516
1517         dbg = 0;
1518         audit_msg = msg;
1519         audit_rmap(vcpu);
1520         audit_write_protection(vcpu);
1521         audit_mappings(vcpu);
1522         dbg = olddbg;
1523 }
1524
1525 #endif