/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * MMU support
 *
 * Copyright (C) 2006 Qumranet, Inc.
 *
 * Authors:
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Avi Kivity   <avi@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */
#include <linux/types.h>
#include <linux/string.h>
#include <asm/page.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/module.h>

#include "vmx.h"
#include "kvm.h"

#define pgprintk(x...) do { } while (0)
#define rmap_printk(x...) do { } while (0)

#define ASSERT(x)                                                       \
        if (!(x)) {                                                     \
                printk(KERN_WARNING "assertion failed %s:%d: %s\n",     \
                       __FILE__, __LINE__, #x);                         \
        }

#define PT64_ENT_PER_PAGE 512
#define PT32_ENT_PER_PAGE 1024

#define PT_WRITABLE_SHIFT 1

#define PT_PRESENT_MASK (1ULL << 0)
#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
#define PT_USER_MASK (1ULL << 2)
#define PT_PWT_MASK (1ULL << 3)
#define PT_PCD_MASK (1ULL << 4)
#define PT_ACCESSED_MASK (1ULL << 5)
#define PT_DIRTY_MASK (1ULL << 6)
#define PT_PAGE_SIZE_MASK (1ULL << 7)
#define PT_PAT_MASK (1ULL << 7)
#define PT_GLOBAL_MASK (1ULL << 8)
#define PT64_NX_MASK (1ULL << 63)

#define PT_PAT_SHIFT 7
#define PT_DIR_PAT_SHIFT 12
#define PT_DIR_PAT_MASK (1ULL << PT_DIR_PAT_SHIFT)

#define PT32_DIR_PSE36_SIZE 4
#define PT32_DIR_PSE36_SHIFT 13
#define PT32_DIR_PSE36_MASK (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT)


#define PT32_PTE_COPY_MASK \
        (PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_DIRTY_MASK | PT_GLOBAL_MASK)

#define PT64_PTE_COPY_MASK (PT64_NX_MASK | PT32_PTE_COPY_MASK)

#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

#define PT_SHADOW_PS_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
#define PT_SHADOW_IO_MARK (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)

#define PT_SHADOW_WRITABLE_SHIFT (PT_FIRST_AVAIL_BITS_SHIFT + 1)
#define PT_SHADOW_WRITABLE_MASK (1ULL << PT_SHADOW_WRITABLE_SHIFT)

#define PT_SHADOW_USER_SHIFT (PT_SHADOW_WRITABLE_SHIFT + 1)
#define PT_SHADOW_USER_MASK (1ULL << (PT_SHADOW_USER_SHIFT))

#define PT_SHADOW_BITS_OFFSET (PT_SHADOW_WRITABLE_SHIFT - PT_WRITABLE_SHIFT)

#define VALID_PAGE(x) ((x) != INVALID_PAGE)

#define PT64_LEVEL_BITS 9

#define PT64_LEVEL_SHIFT(level) \
                ( PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS )

#define PT64_LEVEL_MASK(level) \
                (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))

#define PT64_INDEX(address, level)\
        (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))


#define PT32_LEVEL_BITS 10

#define PT32_LEVEL_SHIFT(level) \
                ( PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS )

#define PT32_LEVEL_MASK(level) \
                (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))

#define PT32_INDEX(address, level)\
        (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))


#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK)
#define PT64_DIR_BASE_ADDR_MASK \
        (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))

#define PT32_BASE_ADDR_MASK PAGE_MASK
#define PT32_DIR_BASE_ADDR_MASK \
        (PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))


#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK (1U << 1)
#define PFERR_USER_MASK (1U << 2)

#define PT64_ROOT_LEVEL 4
#define PT32_ROOT_LEVEL 2
#define PT32E_ROOT_LEVEL 3

#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1

#define RMAP_EXT 4

struct kvm_rmap_desc {
        u64 *shadow_ptes[RMAP_EXT];
        struct kvm_rmap_desc *more;
};

static int is_write_protection(struct kvm_vcpu *vcpu)
{
        return vcpu->cr0 & CR0_WP_MASK;
}

static int is_cpuid_PSE36(void)
{
        return 1;
}

static int is_present_pte(unsigned long pte)
{
        return pte & PT_PRESENT_MASK;
}

static int is_writeble_pte(unsigned long pte)
{
        return pte & PT_WRITABLE_MASK;
}

static int is_io_pte(unsigned long pte)
{
        return pte & PT_SHADOW_IO_MARK;
}

static int is_rmap_pte(u64 pte)
{
        return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
                == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}

/*
 * Reverse mapping data structures:
 *
 * If page->private bit zero is zero, then page->private points to the
 * shadow page table entry that points to page_address(page).
 *
 * If page->private bit zero is one, (then page->private & ~1) points
 * to a struct kvm_rmap_desc containing more mappings.
 */
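/*
 * Illustrative example: if spte_a is the only shadow pte mapping a page,
 * then page->private == (unsigned long)spte_a.  After rmap_add() is called
 * for a second shadow pte spte_b mapping the same page, page->private
 * becomes (unsigned long)desc | 1, with
 * desc->shadow_ptes == { spte_a, spte_b, NULL, NULL } and desc->more == NULL.
 */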
static void rmap_add(struct kvm *kvm, u64 *spte)
{
        struct page *page;
        struct kvm_rmap_desc *desc;
        int i;

        if (!is_rmap_pte(*spte))
                return;
        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        if (!page->private) {
                rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
                page->private = (unsigned long)spte;
        } else if (!(page->private & 1)) {
                rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
                desc = kzalloc(sizeof *desc, GFP_NOWAIT);
                if (!desc)
                        BUG(); /* FIXME: return error */
                desc->shadow_ptes[0] = (u64 *)page->private;
                desc->shadow_ptes[1] = spte;
                page->private = (unsigned long)desc | 1;
        } else {
                rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
                while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
                        desc = desc->more;
                if (desc->shadow_ptes[RMAP_EXT-1]) {
                        desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
                        if (!desc->more)
                                BUG(); /* FIXME: return error */
                        desc = desc->more;
                }
                for (i = 0; desc->shadow_ptes[i]; ++i)
                        ;
                desc->shadow_ptes[i] = spte;
        }
}

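/*
 * Remove shadow_ptes[i] from a descriptor: the last used slot is moved
 * into slot i, and the descriptor is unlinked and freed once it becomes
 * empty, collapsing back to the single-spte encoding when possible.
 */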
static void rmap_desc_remove_entry(struct page *page,
                                   struct kvm_rmap_desc *desc,
                                   int i,
                                   struct kvm_rmap_desc *prev_desc)
{
        int j;

        for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
                ;
        desc->shadow_ptes[i] = desc->shadow_ptes[j];
        desc->shadow_ptes[j] = 0;
        if (j != 0)
                return;
        if (!prev_desc && !desc->more)
                page->private = (unsigned long)desc->shadow_ptes[0];
        else
                if (prev_desc)
                        prev_desc->more = desc->more;
                else
                        page->private = (unsigned long)desc->more | 1;
        kfree(desc);
}

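/*
 * Drop the reverse mapping for one shadow pte.  Handles all three
 * encodings of page->private and BUG()s if the spte is not found.
 */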
static void rmap_remove(struct kvm *kvm, u64 *spte)
{
        struct page *page;
        struct kvm_rmap_desc *desc;
        struct kvm_rmap_desc *prev_desc;
        int i;

        if (!is_rmap_pte(*spte))
                return;
        page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
        if (!page->private) {
                printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
                BUG();
        } else if (!(page->private & 1)) {
                rmap_printk("rmap_remove:  %p %llx 1->0\n", spte, *spte);
                if ((u64 *)page->private != spte) {
                        printk(KERN_ERR "rmap_remove:  %p %llx 1->BUG\n",
                               spte, *spte);
                        BUG();
                }
                page->private = 0;
        } else {
                rmap_printk("rmap_remove:  %p %llx many->many\n", spte, *spte);
                desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
                prev_desc = NULL;
                while (desc) {
                        for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
                                if (desc->shadow_ptes[i] == spte) {
                                        rmap_desc_remove_entry(page, desc, i,
                                                               prev_desc);
                                        return;
                                }
                        prev_desc = desc;
                        desc = desc->more;
                }
                BUG();
        }
}

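/*
 * Return a shadow page to the vcpu's free list.  The page contents are
 * expected to have been cleared by the caller (see release_pt_page_64()).
 */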
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
        struct kvm_mmu_page *page_head = page_header(page_hpa);

        list_del(&page_head->link);
        page_head->page_hpa = page_hpa;
        list_add(&page_head->link, &vcpu->free_pages);
}

static int is_empty_shadow_page(hpa_t page_hpa)
{
        u32 *pos;
        u32 *end;
        for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
                      pos != end; pos++)
                if (*pos != 0)
                        return 0;
        return 1;
}

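/*
 * Take a pre-zeroed shadow page off the vcpu's free list and move it to
 * the kvm-wide active list.  Returns NULL when the free list is empty;
 * parent_pte records the shadow pte that will point at this page.
 */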
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
                                               u64 *parent_pte)
{
        struct kvm_mmu_page *page;

        if (list_empty(&vcpu->free_pages))
                return NULL;

        page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
        list_del(&page->link);
        list_add(&page->link, &vcpu->kvm->active_mmu_pages);
        ASSERT(is_empty_shadow_page(page->page_hpa));
        page->slot_bitmap = 0;
        page->global = 1;
        page->parent_pte = parent_pte;
        return page;
}

static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
        int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
        struct kvm_mmu_page *page_head = page_header(__pa(pte));

        __set_bit(slot, &page_head->slot_bitmap);
}

hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        hpa_t hpa = gpa_to_hpa(vcpu, gpa);

        return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK) : hpa;
}

hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa)
{
        struct kvm_memory_slot *slot;
        struct page *page;

        ASSERT((gpa & HPA_ERR_MASK) == 0);
        slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
        if (!slot)
                return gpa | HPA_ERR_MASK;
        page = gfn_to_page(slot, gpa >> PAGE_SHIFT);
        return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT)
                | (gpa & (PAGE_SIZE-1));
}

hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva)
{
        gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);

        if (gpa == UNMAPPED_GVA)
                return UNMAPPED_GVA;
        return gpa_to_hpa(vcpu, gpa);
}


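/*
 * Zero every entry of a shadow page table page, recursing into lower
 * levels (and dropping rmap entries at the last level), then return the
 * page to the free list.
 */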
static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
                               int level)
{
        u64 *pos;
        u64 *end;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(page_hpa));
        ASSERT(level <= PT64_ROOT_LEVEL && level > 0);

        for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
             pos != end; pos++) {
                u64 current_ent = *pos;

                if (is_present_pte(current_ent)) {
                        if (level != 1)
                                release_pt_page_64(vcpu,
                                                  current_ent &
                                                  PT64_BASE_ADDR_MASK,
                                                  level - 1);
                        else
                                rmap_remove(vcpu->kvm, pos);
                }
                *pos = 0;
        }
        kvm_mmu_free_page(vcpu, page_hpa);
}

static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

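/*
 * Install a shadow mapping from guest address v to host physical address p
 * for the non-paging case, allocating intermediate shadow pages as the walk
 * descends.  Returns -ENOMEM if the free list is exhausted.
 */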
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
{
        int level = PT32E_ROOT_LEVEL;
        hpa_t table_addr = vcpu->mmu.root_hpa;

        for (; ; level--) {
                u32 index = PT64_INDEX(v, level);
                u64 *table;

                ASSERT(VALID_PAGE(table_addr));
                table = __va(table_addr);

                if (level == 1) {
                        mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
                        page_header_update_slot(vcpu->kvm, table, v);
                        table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
                                                                PT_USER_MASK;
                        rmap_add(vcpu->kvm, &table[index]);
                        return 0;
                }

                if (table[index] == 0) {
                        struct kvm_mmu_page *new_table;

                        new_table = kvm_mmu_alloc_page(vcpu, &table[index]);
                        if (!new_table) {
                                pgprintk("nonpaging_map: ENOMEM\n");
                                return -ENOMEM;
                        }

                        table[index] = new_table->page_hpa | PT_PRESENT_MASK
                                | PT_WRITABLE_MASK | PT_USER_MASK;
                }
                table_addr = table[index] & PT64_BASE_ADDR_MASK;
        }
}

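/*
 * Tear down the current shadow root(s): either the single 64-bit root or
 * the four PAE page directory pointers, returning all pages to the free
 * list and marking root_hpa invalid.
 */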
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
        int i;

#ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;

                ASSERT(VALID_PAGE(root));
                release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL);
                vcpu->mmu.root_hpa = INVALID_PAGE;
                return;
        }
#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->mmu.pae_root[i];

                ASSERT(VALID_PAGE(root));
                root &= PT64_BASE_ADDR_MASK;
                release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1);
                vcpu->mmu.pae_root[i] = INVALID_PAGE;
        }
        vcpu->mmu.root_hpa = INVALID_PAGE;
}

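/*
 * Allocate fresh shadow root pages: one top-level page in long mode,
 * otherwise four page directories referenced from the preallocated
 * pae_root array.
 */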
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
        int i;

#ifdef CONFIG_X86_64
        if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
                hpa_t root = vcpu->mmu.root_hpa;

                ASSERT(!VALID_PAGE(root));
                root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
                vcpu->mmu.root_hpa = root;
                return;
        }
#endif
        for (i = 0; i < 4; ++i) {
                hpa_t root = vcpu->mmu.pae_root[i];

                ASSERT(!VALID_PAGE(root));
                root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
                vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
        }
        vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}

static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
        hpa_t root = vcpu->mmu.root_hpa;

        ++kvm_stat.tlb_flush;
        pgprintk("nonpaging_flush\n");
        mmu_free_roots(vcpu);
        mmu_alloc_roots(vcpu);
        kvm_arch_ops->set_cr3(vcpu, root);
        kvm_arch_ops->tlb_flush(vcpu);
}

static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
        return vaddr;
}

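/*
 * Handle a fault while the guest has paging disabled: translate the
 * address and map it; on -ENOMEM, flush all shadow roots to reclaim
 * pages and retry.  Returns 1 when the address has no backing memory slot.
 */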
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                               u32 error_code)
{
        int ret;
        gpa_t addr = gva;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));

        for (;;) {
                hpa_t paddr;

                paddr = gpa_to_hpa(vcpu, addr & PT64_BASE_ADDR_MASK);

                if (is_error_hpa(paddr))
                        return 1;

                ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
                if (ret) {
                        nonpaging_flush(vcpu);
                        continue;
                }
                break;
        }
        return ret;
}

static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
}

static void nonpaging_free(struct kvm_vcpu *vcpu)
{
        mmu_free_roots(vcpu);
}

static int nonpaging_init_context(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->new_cr3 = nonpaging_new_cr3;
        context->page_fault = nonpaging_page_fault;
        context->inval_page = nonpaging_inval_page;
        context->gva_to_gpa = nonpaging_gva_to_gpa;
        context->free = nonpaging_free;
        context->root_level = PT32E_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
        return 0;
}


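/*
 * Emulate a guest TLB flush by zapping every non-global shadow page that
 * has a parent pte, then flushing the hardware TLB.
 */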
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu_page *page, *npage;

        list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
                                 link) {
                if (page->global)
                        continue;

                if (!page->parent_pte)
                        continue;

                *page->parent_pte = 0;
                release_pt_page_64(vcpu, page->page_hpa, 1);
        }
        ++kvm_stat.tlb_flush;
        kvm_arch_ops->tlb_flush(vcpu);
}

static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
        kvm_mmu_flush_tlb(vcpu);
}

static void mark_pagetable_nonglobal(void *shadow_pte)
{
        page_header(__pa(shadow_pte))->global = 0;
}

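/*
 * Fill in the fields common to 32-bit and 64-bit shadow ptes: stash the
 * guest access bits in the shadow "avail" bits, translate the guest
 * physical address, and either point the pte at the host page (adding an
 * rmap entry) or mark it as a non-present I/O pte when no slot backs it.
 */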
static inline void set_pte_common(struct kvm_vcpu *vcpu,
                             u64 *shadow_pte,
                             gpa_t gaddr,
                             int dirty,
                             u64 access_bits)
{
        hpa_t paddr;

        *shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
        if (!dirty)
                access_bits &= ~PT_WRITABLE_MASK;

        if (access_bits & PT_WRITABLE_MASK)
                mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);

        *shadow_pte |= access_bits;

        paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);

        if (!(*shadow_pte & PT_GLOBAL_MASK))
                mark_pagetable_nonglobal(shadow_pte);

        if (is_error_hpa(paddr)) {
                *shadow_pte |= gaddr;
                *shadow_pte |= PT_SHADOW_IO_MARK;
                *shadow_pte &= ~PT_PRESENT_MASK;
        } else {
                *shadow_pte |= paddr;
                page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
                rmap_add(vcpu->kvm, shadow_pte);
        }
}

static void inject_page_fault(struct kvm_vcpu *vcpu,
                              u64 addr,
                              u32 err_code)
{
        kvm_arch_ops->inject_page_fault(vcpu, addr, err_code);
}

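/*
 * Fix up a shadow pte for a read fault on a supervisor page that was
 * shadowed as a user page (see the comment in the function body): grant
 * PT_USER_MASK but clear PT_WRITABLE_MASK so writes still trap.
 */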
static inline int fix_read_pf(u64 *shadow_ent)
{
        if ((*shadow_ent & PT_SHADOW_USER_MASK) &&
            !(*shadow_ent & PT_USER_MASK)) {
                /*
                 * If supervisor write protect is disabled, we shadow kernel
                 * pages as user pages so we can trap the write access.
                 */
                *shadow_ent |= PT_USER_MASK;
                *shadow_ent &= ~PT_WRITABLE_MASK;

                return 1;
        }
        return 0;
}

static int may_access(u64 pte, int write, int user)
{
        if (user && !(pte & PT_USER_MASK))
                return 0;
        if (write && !(pte & PT_WRITABLE_MASK))
                return 0;
        return 1;
}

/*
 * Remove a shadow pte.
 */
static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
{
        hpa_t page_addr = vcpu->mmu.root_hpa;
        int level = vcpu->mmu.shadow_root_level;

        ++kvm_stat.invlpg;

        for (; ; level--) {
                u32 index = PT64_INDEX(addr, level);
                u64 *table = __va(page_addr);

                if (level == PT_PAGE_TABLE_LEVEL) {
                        rmap_remove(vcpu->kvm, &table[index]);
                        table[index] = 0;
                        return;
                }

                if (!is_present_pte(table[index]))
                        return;

                page_addr = table[index] & PT64_BASE_ADDR_MASK;

                if (level == PT_DIRECTORY_LEVEL &&
                    (table[index] & PT_SHADOW_PS_MARK)) {
                        table[index] = 0;
                        release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);

                        kvm_arch_ops->tlb_flush(vcpu);
                        return;
                }
        }
}

static void paging_free(struct kvm_vcpu *vcpu)
{
        nonpaging_free(vcpu);
}

#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE

#define PTTYPE 32
#include "paging_tmpl.h"
#undef PTTYPE

static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
        struct kvm_mmu *context = &vcpu->mmu;

        ASSERT(is_pae(vcpu));
        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging64_page_fault;
        context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging64_gva_to_gpa;
        context->free = paging_free;
        context->root_level = level;
        context->shadow_root_level = level;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
}

static int paging64_init_context(struct kvm_vcpu *vcpu)
{
        return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
}

static int paging32_init_context(struct kvm_vcpu *vcpu)
{
        struct kvm_mmu *context = &vcpu->mmu;

        context->new_cr3 = paging_new_cr3;
        context->page_fault = paging32_page_fault;
        context->inval_page = paging_inval_page;
        context->gva_to_gpa = paging32_gva_to_gpa;
        context->free = paging_free;
        context->root_level = PT32_ROOT_LEVEL;
        context->shadow_root_level = PT32E_ROOT_LEVEL;
        mmu_alloc_roots(vcpu);
        ASSERT(VALID_PAGE(context->root_hpa));
        kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
                    (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
        return 0;
}

static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
        return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}

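/*
 * Pick the shadow MMU flavour that matches the guest's current paging
 * mode: non-paging, 64-bit (long mode), PAE, or legacy 32-bit.
 */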
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));

        if (!is_paging(vcpu))
                return nonpaging_init_context(vcpu);
        else if (is_long_mode(vcpu))
                return paging64_init_context(vcpu);
        else if (is_pae(vcpu))
                return paging32E_init_context(vcpu);
        else
                return paging32_init_context(vcpu);
}

static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        if (VALID_PAGE(vcpu->mmu.root_hpa)) {
                vcpu->mmu.free(vcpu);
                vcpu->mmu.root_hpa = INVALID_PAGE;
        }
}

int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
        destroy_kvm_mmu(vcpu);
        return init_kvm_mmu(vcpu);
}

static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
        while (!list_empty(&vcpu->free_pages)) {
                struct kvm_mmu_page *page;

                page = list_entry(vcpu->free_pages.next,
                                  struct kvm_mmu_page, link);
                list_del(&page->link);
                __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
                page->page_hpa = INVALID_PAGE;
        }
        free_page((unsigned long)vcpu->mmu.pae_root);
}

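/*
 * Preallocate the per-vcpu pool of shadow pages and the PAE root page.
 * Each shadow page's page->private is pointed back at its kvm_mmu_page
 * header.
 */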
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
        struct page *page;
        int i;

        ASSERT(vcpu);

        for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
                struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];

                INIT_LIST_HEAD(&page_header->link);
                if ((page = alloc_page(GFP_KERNEL)) == NULL)
                        goto error_1;
                page->private = (unsigned long)page_header;
                page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
                memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
                list_add(&page_header->link, &vcpu->free_pages);
        }

        /*
         * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
         * Therefore we need to allocate shadow page tables in the first
         * 4GB of memory, which happens to fit the DMA32 zone.
         */
        page = alloc_page(GFP_KERNEL | __GFP_DMA32);
        if (!page)
                goto error_1;
        vcpu->mmu.pae_root = page_address(page);
        for (i = 0; i < 4; ++i)
                vcpu->mmu.pae_root[i] = INVALID_PAGE;

        return 0;

error_1:
        free_mmu_pages(vcpu);
        return -ENOMEM;
}

int kvm_mmu_create(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
        ASSERT(list_empty(&vcpu->free_pages));

        return alloc_mmu_pages(vcpu);
}

int kvm_mmu_setup(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);
        ASSERT(!VALID_PAGE(vcpu->mmu.root_hpa));
        ASSERT(!list_empty(&vcpu->free_pages));

        return init_kvm_mmu(vcpu);
}

void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
        ASSERT(vcpu);

        destroy_kvm_mmu(vcpu);
        free_mmu_pages(vcpu);
}

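/*
 * Clear PT_WRITABLE_MASK from every shadow pte belonging to a given memory
 * slot (and drop the corresponding rmap entries), so that subsequent guest
 * writes to the slot fault into the hypervisor.
 */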
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
        struct kvm_mmu_page *page;

        list_for_each_entry(page, &kvm->active_mmu_pages, link) {
                int i;
                u64 *pt;

                if (!test_bit(slot, &page->slot_bitmap))
                        continue;

                pt = __va(page->page_hpa);
                for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                        /* avoid RMW */
                        if (pt[i] & PT_WRITABLE_MASK) {
                                rmap_remove(kvm, &pt[i]);
                                pt[i] &= ~PT_WRITABLE_MASK;
                        }
        }
}