[PATCH] powerpc: Fix pagetable bloat for hugepages

author David Gibson <david@gibson.dropbear.id.au>

Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)

committer Paul Mackerras <paulus@samba.org>

Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
author David Gibson <david@gibson.dropbear.id.au>
Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
committer Paul Mackerras <paulus@samba.org>
Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c

index 7370f9f33e2943d038025a6d05ba6b4e02755beb..266b8b2ceac947071d225e4de981d2229d499fc2 100644 (file)
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
  #define NUM_LOW_AREAS  (0x100000000UL >> SID_SHIFT)
  #define NUM_HIGH_AREAS (PGTABLE_RANGE >> HTLB_AREA_SHIFT)
  
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE     (PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE     (PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE       (1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE     (sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT           (HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE            (1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK            (~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache     (pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK      0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)       ((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+       BUG_ON(!(hpd.pd & HUGEPD_OK));
+       return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+       unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+       pte_t *dir = hugepd_page(*hpdp);
+
+       return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+                          unsigned long address)
+{
+       pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+                                     GFP_KERNEL|__GFP_REPEAT);
+
+       if (! new)
+               return -ENOMEM;
+
+       spin_lock(&mm->page_table_lock);
+       if (!hugepd_none(*hpdp))
+               kmem_cache_free(huge_pgtable_cache, new);
+       else
+               hpdp->pd = (unsigned long)new | HUGEPD_OK;
+       spin_unlock(&mm->page_table_lock);
+       return 0;
+}
+
  /* Modelled after find_linux_pte() */
  pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
  {
         pgd_t *pg;
         pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
  
         BUG_ON(! in_hugepage_area(mm->context, addr));
  
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
         if (!pgd_none(*pg)) {
                 pu = pud_offset(pg, addr);
                 if (!pud_none(*pu)) {
-                       pm = pmd_offset(pu, addr);
  #ifdef CONFIG_PPC_64K_PAGES
-                       /* Currently, we use the normal PTE offset within full
-                        * size PTE pages, thus our huge PTEs are scattered in
-                        * the PTE page and we do waste some. We may change
-                        * that in the future, but the current mecanism keeps
-                        * things much simpler
-                        */
-                       if (!pmd_none(*pm)) {
-                               /* Note: pte_offset_* are all equivalent on
-                                * ppc64 as we don't have HIGHMEM
-                                */
-                               pt = pte_offset_kernel(pm, addr);
-                               return pt;
-                       }
-#else /* CONFIG_PPC_64K_PAGES */
-                       /* On 4k pages, we put huge PTEs in the PMD page */
-                       pt = (pte_t *)pm;
-                       return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+                       pmd_t *pm;
+                       pm = pmd_offset(pu, addr);
+                       if (!pmd_none(*pm))
+                               return hugepte_offset((hugepd_t *)pm, addr);
+#else
+                       return hugepte_offset((hugepd_t *)pu, addr);
+#endif
                 }
         }
  
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
  {
         pgd_t *pg;
         pud_t *pu;
-       pmd_t *pm;
-       pte_t *pt;
+       hugepd_t *hpdp = NULL;
  
         BUG_ON(! in_hugepage_area(mm->context, addr));
  
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
         pu = pud_alloc(mm, pg, addr);
  
         if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+               pmd_t *pm;
                 pm = pmd_alloc(mm, pu, addr);
-               if (pm) {
+               if (pm)
+                       hpdp = (hugepd_t *)pm;
+#else
+               hpdp = (hugepd_t *)pu;
+#endif
+       }
+
+       if (! hpdp)
+               return NULL;
+
+       if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+               return NULL;
+
+       return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+       pte_t *hugepte = hugepd_page(*hpdp);
+
+       hpdp->pd = 0;
+       tlb->need_flush = 1;
+       pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+                                                HUGEPTE_TABLE_SIZE-1));
+}
+
  #ifdef CONFIG_PPC_64K_PAGES
-                       /* See comment in huge_pte_offset. Note that if we ever
-                        * want to put the page size in the PMD, we would have
-                        * to open code our own pte_alloc* function in order
-                        * to populate and set the size atomically
-                        */
-                       pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-                       pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-                       return pt;
-               }
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+                                  unsigned long addr, unsigned long end,
+                                  unsigned long floor, unsigned long ceiling)
+{
+       pmd_t *pmd;
+       unsigned long next;
+       unsigned long start;
+
+       start = addr;
+       pmd = pmd_offset(pud, addr);
+       do {
+               next = pmd_addr_end(addr, end);
+               if (pmd_none(*pmd))
+                       continue;
+               free_hugepte_range(tlb, (hugepd_t *)pmd);
+       } while (pmd++, addr = next, addr != end);
+
+       start &= PUD_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PUD_MASK;
+               if (!ceiling)
+                       return;
         }
+       if (end - 1 > ceiling - 1)
+               return;
  
-       return NULL;
+       pmd = pmd_offset(pud, start);
+       pud_clear(pud);
+       pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+                                  unsigned long addr, unsigned long end,
+                                  unsigned long floor, unsigned long ceiling)
+{
+       pud_t *pud;
+       unsigned long next;
+       unsigned long start;
+
+       start = addr;
+       pud = pud_offset(pgd, addr);
+       do {
+               next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+               if (pud_none_or_clear_bad(pud))
+                       continue;
+               hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+               if (pud_none(*pud))
+                       continue;
+               free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+       } while (pud++, addr = next, addr != end);
+
+       start &= PGDIR_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PGDIR_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               return;
+
+       pud = pud_offset(pgd, start);
+       pgd_clear(pgd);
+       pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+                           unsigned long addr, unsigned long end,
+                           unsigned long floor, unsigned long ceiling)
+{
+       pgd_t *pgd;
+       unsigned long next;
+       unsigned long start;
+
+       /*
+        * Comments below take from the normal free_pgd_range().  They
+        * apply here too.  The tests against HUGEPD_MASK below are
+        * essential, because we *don't* test for this at the bottom
+        * level.  Without them we'll attempt to free a hugepte table
+        * when we unmap just part of it, even if there are other
+        * active mappings using it.
+        *
+        * The next few lines have given us lots of grief...
+        *
+        * Why are we testing HUGEPD* at this top level?  Because
+        * often there will be no work to do at all, and we'd prefer
+        * not to go all the way down to the bottom just to discover
+        * that.
+        *
+        * Why all these "- 1"s?  Because 0 represents both the bottom
+        * of the address space and the top of it (using -1 for the
+        * top wouldn't help much: the masks would do the wrong thing).
+        * The rule is that addr 0 and floor 0 refer to the bottom of
+        * the address space, but end 0 and ceiling 0 refer to the top
+        * Comparisons need to use "end - 1" and "ceiling - 1" (though
+        * that end 0 case should be mythical).
+        *
+        * Wherever addr is brought up or ceiling brought down, we
+        * must be careful to reject "the opposite 0" before it
+        * confuses the subsequent tests.  But what about where end is
+        * brought down by HUGEPD_SIZE below? no, end can't go down to
+        * 0 there.
+        *
+        * Whereas we round start (addr) and ceiling down, by different
+        * masks at different levels, in order to test whether a table
+        * now has no other vmas using it, so can be freed, we don't
+        * bother to round floor or end up - the tests don't need that.
+        */
+
+       addr &= HUGEPD_MASK;
+       if (addr < floor) {
+               addr += HUGEPD_SIZE;
+               if (!addr)
+                       return;
+       }
+       if (ceiling) {
+               ceiling &= HUGEPD_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               end -= HUGEPD_SIZE;
+       if (addr > end - 1)
+               return;
+
+       start = addr;
+       pgd = pgd_offset((*tlb)->mm, addr);
+       do {
+               BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+               next = pgd_addr_end(addr, end);
+               if (pgd_none_or_clear_bad(pgd))
+                       continue;
+               hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+       } while (pgd++, addr = next, addr != end);
  }
  
  void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
   out:
         return err;
  }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+       memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+       if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+               return -ENODEV;
+
+       huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+                                              HUGEPTE_TABLE_SIZE,
+                                              HUGEPTE_TABLE_SIZE,
+                                              SLAB_HWCACHE_ALIGN |
+                                              SLAB_MUST_HWCACHE_ALIGN,
+                                              zero_ctor, NULL);
+       if (! huge_pgtable_cache)
+               panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+       return 0;
+}
+
+module_init(hugetlbpage_init);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c

index babebd15bdc49d7dc1fd905c8450948673c35876..9e30f968c184af90dfad58f5ff357edfe81e0ed3 100644 (file)
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -162,7 +162,14 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
  };
  #endif /* CONFIG_PPC_64K_PAGES */
  
+#ifdef CONFIG_HUGETLB_PAGE
+/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
+ * can't put into the tables above, because HPAGE_SHIFT is not compile
+ * time constant. */
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+#else
  kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+#endif
  
  void pgtable_cache_init(void)
  {
diff --git a/include/asm-powerpc/page_64.h b/include/asm-powerpc/page_64.h

index 3fb061bab9ecd75f0fc912d23fc7cbb71f5c45c7..eab779c219958b18b82afd1ff5c55bc5de05f89b 100644 (file)
--- a/include/asm-powerpc/page_64.h
+++ b/include/asm-powerpc/page_64.h
@@ -101,6 +101,7 @@ extern unsigned int HPAGE_SHIFT;
                                       - (1U << GET_HTLB_AREA(addr))) & 0xffff)
  
  #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+#define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
  #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
  #define ARCH_HAS_SETCLEAR_HUGE_PTE
  
diff --git a/include/asm-powerpc/pgalloc.h b/include/asm-powerpc/pgalloc.h

index a00ee002cd1165f9eb298a4112a4a74712c05966..9f0917c6865978a0c39dfd51a6194a06cb7f4a04 100644 (file)
--- a/include/asm-powerpc/pgalloc.h
+++ b/include/asm-powerpc/pgalloc.h
@@ -17,11 +17,13 @@ extern kmem_cache_t *pgtable_cache[];
  #define PTE_CACHE_NUM  0
  #define PMD_CACHE_NUM  1
  #define PGD_CACHE_NUM  2
+#define HUGEPTE_CACHE_NUM 3
  #else
  #define PTE_CACHE_NUM  0
  #define PMD_CACHE_NUM  1
  #define PUD_CACHE_NUM  1
  #define PGD_CACHE_NUM  0
+#define HUGEPTE_CACHE_NUM 2
  #endif
  
  /*
author	David Gibson <david@gibson.dropbear.id.au>
	Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
committer	Paul Mackerras <paulus@samba.org>
	Fri, 28 Apr 2006 05:02:51 +0000 (15:02 +1000)
arch/powerpc/mm/hugetlbpage.c		patch \| blob \| history
arch/powerpc/mm/init_64.c		patch \| blob \| history
include/asm-powerpc/page_64.h		patch \| blob \| history
include/asm-powerpc/pgalloc.h		patch \| blob \| history