From f10a04c034c7285a1b15dfa4a83d3e56578e34e8 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Fri, 28 Apr 2006 15:02:51 +1000
Subject: [PATCH] [PATCH] powerpc: Fix pagetable bloat for hugepages

At present, ARCH=powerpc kernels can waste considerable space in
pagetables when making large hugepage mappings.  Hugepage PTEs go in
PMD pages, but each PMD page maps 256M and so contains only 16
hugepage PTEs (128 bytes of data), but takes up a 1024 byte
allocation.  With CONFIG_PPC_64K_PAGES enabled (64k base page size),
the situation is worse.  Now hugepage PTEs are at the PTE page level
(also mapping 256M), so we store 16 hugepage PTEs in a 64k allocation.

The PowerPC MMU already means that any 256M region is either all
hugepage, or all normal pages.  Thus, with some care, we can use a
different allocation for the hugepage PTE tables and only allocate the
128 bytes necessary.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/mm/hugetlbpage.c | 295 +++++++++++++++++++++++++++++-----
 arch/powerpc/mm/init_64.c     |   7 +
 include/asm-powerpc/page_64.h |   1 +
 include/asm-powerpc/pgalloc.h |   2 +
 4 files changed, 269 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 7370f9f33e..266b8b2cea 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -30,13 +30,66 @@
 #define NUM_LOW_AREAS	(0x100000000UL >> SID_SHIFT)
 #define NUM_HIGH_AREAS	(PGTABLE_RANGE >> HTLB_AREA_SHIFT)
 
+#ifdef CONFIG_PPC_64K_PAGES
+#define HUGEPTE_INDEX_SIZE	(PMD_SHIFT-HPAGE_SHIFT)
+#else
+#define HUGEPTE_INDEX_SIZE	(PUD_SHIFT-HPAGE_SHIFT)
+#endif
+#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
+#define HUGEPTE_TABLE_SIZE	(sizeof(pte_t) << HUGEPTE_INDEX_SIZE)
+
+#define HUGEPD_SHIFT		(HPAGE_SHIFT + HUGEPTE_INDEX_SIZE)
+#define HUGEPD_SIZE		(1UL << HUGEPD_SHIFT)
+#define HUGEPD_MASK		(~(HUGEPD_SIZE-1))
+
+#define huge_pgtable_cache	(pgtable_cache[HUGEPTE_CACHE_NUM])
+
+/* Flag to mark huge PD pointers.  This means pmd_bad() and pud_bad()
+ * will choke on pointers to hugepte tables, which is handy for
+ * catching screwups early. */
+#define HUGEPD_OK	0x1
+
+typedef struct { unsigned long pd; } hugepd_t;
+
+#define hugepd_none(hpd)	((hpd).pd == 0)
+
+static inline pte_t *hugepd_page(hugepd_t hpd)
+{
+	BUG_ON(!(hpd.pd & HUGEPD_OK));
+	return (pte_t *)(hpd.pd & ~HUGEPD_OK);
+}
+
+static inline pte_t *hugepte_offset(hugepd_t *hpdp, unsigned long addr)
+{
+	unsigned long idx = ((addr >> HPAGE_SHIFT) & (PTRS_PER_HUGEPTE-1));
+	pte_t *dir = hugepd_page(*hpdp);
+
+	return dir + idx;
+}
+
+static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
+			   unsigned long address)
+{
+	pte_t *new = kmem_cache_alloc(huge_pgtable_cache,
+				      GFP_KERNEL|__GFP_REPEAT);
+
+	if (! new)
+		return -ENOMEM;
+
+	spin_lock(&mm->page_table_lock);
+	if (!hugepd_none(*hpdp))
+		kmem_cache_free(huge_pgtable_cache, new);
+	else
+		hpdp->pd = (unsigned long)new | HUGEPD_OK;
+	spin_unlock(&mm->page_table_lock);
+	return 0;
+}
+
 /* Modelled after find_linux_pte() */
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt;
 
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -46,26 +99,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 	if (!pgd_none(*pg)) {
 		pu = pud_offset(pg, addr);
 		if (!pud_none(*pu)) {
-			pm = pmd_offset(pu, addr);
 #ifdef CONFIG_PPC_64K_PAGES
-			/* Currently, we use the normal PTE offset within full
-			 * size PTE pages, thus our huge PTEs are scattered in
-			 * the PTE page and we do waste some. We may change
-			 * that in the future, but the current mecanism keeps
-			 * things much simpler
-			 */
-			if (!pmd_none(*pm)) {
-				/* Note: pte_offset_* are all equivalent on
-				 * ppc64 as we don't have HIGHMEM
-				 */
-				pt = pte_offset_kernel(pm, addr);
-				return pt;
-			}
-#else /* CONFIG_PPC_64K_PAGES */
-			/* On 4k pages, we put huge PTEs in the PMD page */
-			pt = (pte_t *)pm;
-			return pt;
-#endif /* CONFIG_PPC_64K_PAGES */
+			pmd_t *pm;
+			pm = pmd_offset(pu, addr);
+			if (!pmd_none(*pm))
+				return hugepte_offset((hugepd_t *)pm, addr);
+#else
+			return hugepte_offset((hugepd_t *)pu, addr);
+#endif
 		}
 	}
 
@@ -76,8 +117,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pg;
 	pud_t *pu;
-	pmd_t *pm;
-	pte_t *pt;
+	hugepd_t *hpdp = NULL;
 
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
@@ -87,23 +127,182 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 	pu = pud_alloc(mm, pg, addr);
 
 	if (pu) {
+#ifdef CONFIG_PPC_64K_PAGES
+		pmd_t *pm;
 		pm = pmd_alloc(mm, pu, addr);
-		if (pm) {
+		if (pm)
+			hpdp = (hugepd_t *)pm;
+#else
+		hpdp = (hugepd_t *)pu;
+#endif
+	}
+
+	if (! hpdp)
+		return NULL;
+
+	if (hugepd_none(*hpdp) && __hugepte_alloc(mm, hpdp, addr))
+		return NULL;
+
+	return hugepte_offset(hpdp, addr);
+}
+
+static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
+{
+	pte_t *hugepte = hugepd_page(*hpdp);
+
+	hpdp->pd = 0;
+	tlb->need_flush = 1;
+	pgtable_free_tlb(tlb, pgtable_free_cache(hugepte, HUGEPTE_CACHE_NUM,
+						 HUGEPTE_TABLE_SIZE-1));
+}
+
 #ifdef CONFIG_PPC_64K_PAGES
-			/* See comment in huge_pte_offset. Note that if we ever
-			 * want to put the page size in the PMD, we would have
-			 * to open code our own pte_alloc* function in order
-			 * to populate and set the size atomically
-			 */
-			pt = pte_alloc_map(mm, pm, addr);
-#else /* CONFIG_PPC_64K_PAGES */
-			pt = (pte_t *)pm;
-#endif /* CONFIG_PPC_64K_PAGES */
-			return pt;
-		}
+static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
+{
+	pmd_t *pmd;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pmd = pmd_offset(pud, addr);
+	do {
+		next = pmd_addr_end(addr, end);
+		if (pmd_none(*pmd))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pmd);
+	} while (pmd++, addr = next, addr != end);
+
+	start &= PUD_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PUD_MASK;
+		if (!ceiling)
+			return;
 	}
+	if (end - 1 > ceiling - 1)
+		return;
 
-	return NULL;
+	pmd = pmd_offset(pud, start);
+	pud_clear(pud);
+	pmd_free_tlb(tlb, pmd);
+}
+#endif
+
+static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+				   unsigned long addr, unsigned long end,
+				   unsigned long floor, unsigned long ceiling)
+{
+	pud_t *pud;
+	unsigned long next;
+	unsigned long start;
+
+	start = addr;
+	pud = pud_offset(pgd, addr);
+	do {
+		next = pud_addr_end(addr, end);
+#ifdef CONFIG_PPC_64K_PAGES
+		if (pud_none_or_clear_bad(pud))
+			continue;
+		hugetlb_free_pmd_range(tlb, pud, addr, next, floor, ceiling);
+#else
+		if (pud_none(*pud))
+			continue;
+		free_hugepte_range(tlb, (hugepd_t *)pud);
+#endif
+	} while (pud++, addr = next, addr != end);
+
+	start &= PGDIR_MASK;
+	if (start < floor)
+		return;
+	if (ceiling) {
+		ceiling &= PGDIR_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		return;
+
+	pud = pud_offset(pgd, start);
+	pgd_clear(pgd);
+	pud_free_tlb(tlb, pud);
+}
+
+/*
+ * This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
+ */
+void hugetlb_free_pgd_range(struct mmu_gather **tlb,
+			    unsigned long addr, unsigned long end,
+			    unsigned long floor, unsigned long ceiling)
+{
+	pgd_t *pgd;
+	unsigned long next;
+	unsigned long start;
+
+	/*
+	 * Comments below take from the normal free_pgd_range().  They
+	 * apply here too.  The tests against HUGEPD_MASK below are
+	 * essential, because we *don't* test for this at the bottom
+	 * level.  Without them we'll attempt to free a hugepte table
+	 * when we unmap just part of it, even if there are other
+	 * active mappings using it.
+	 *
+	 * The next few lines have given us lots of grief...
+	 *
+	 * Why are we testing HUGEPD* at this top level?  Because
+	 * often there will be no work to do at all, and we'd prefer
+	 * not to go all the way down to the bottom just to discover
+	 * that.
+	 *
+	 * Why all these "- 1"s?  Because 0 represents both the bottom
+	 * of the address space and the top of it (using -1 for the
+	 * top wouldn't help much: the masks would do the wrong thing).
+	 * The rule is that addr 0 and floor 0 refer to the bottom of
+	 * the address space, but end 0 and ceiling 0 refer to the top
+	 * Comparisons need to use "end - 1" and "ceiling - 1" (though
+	 * that end 0 case should be mythical).
+	 *
+	 * Wherever addr is brought up or ceiling brought down, we
+	 * must be careful to reject "the opposite 0" before it
+	 * confuses the subsequent tests.  But what about where end is
+	 * brought down by HUGEPD_SIZE below? no, end can't go down to
+	 * 0 there.
+	 *
+	 * Whereas we round start (addr) and ceiling down, by different
+	 * masks at different levels, in order to test whether a table
+	 * now has no other vmas using it, so can be freed, we don't
+	 * bother to round floor or end up - the tests don't need that.
+	 */
+
+	addr &= HUGEPD_MASK;
+	if (addr < floor) {
+		addr += HUGEPD_SIZE;
+		if (!addr)
+			return;
+	}
+	if (ceiling) {
+		ceiling &= HUGEPD_MASK;
+		if (!ceiling)
+			return;
+	}
+	if (end - 1 > ceiling - 1)
+		end -= HUGEPD_SIZE;
+	if (addr > end - 1)
+		return;
+
+	start = addr;
+	pgd = pgd_offset((*tlb)->mm, addr);
+	do {
+		BUG_ON(! in_hugepage_area((*tlb)->mm->context, addr));
+		next = pgd_addr_end(addr, end);
+		if (pgd_none_or_clear_bad(pgd))
+			continue;
+		hugetlb_free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
+	} while (pgd++, addr = next, addr != end);
 }
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -841,3 +1040,27 @@ repeat:
  out:
 	return err;
 }
+
+static void zero_ctor(void *addr, kmem_cache_t *cache, unsigned long flags)
+{
+	memset(addr, 0, kmem_cache_size(cache));
+}
+
+static int __init hugetlbpage_init(void)
+{
+	if (!cpu_has_feature(CPU_FTR_16M_PAGE))
+		return -ENODEV;
+
+	huge_pgtable_cache = kmem_cache_create("hugepte_cache",
+					       HUGEPTE_TABLE_SIZE,
+					       HUGEPTE_TABLE_SIZE,
+					       SLAB_HWCACHE_ALIGN |
+					       SLAB_MUST_HWCACHE_ALIGN,
+					       zero_ctor, NULL);
+	if (! huge_pgtable_cache)
+		panic("hugetlbpage_init(): could not create hugepte cache\n");
+
+	return 0;
+}
+
+module_init(hugetlbpage_init);
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index babebd15bd..9e30f968c1 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -162,7 +162,14 @@ static const char *pgtable_cache_name[ARRAY_SIZE(pgtable_cache_size)] = {
 };
 #endif /* CONFIG_PPC_64K_PAGES */
 
+#ifdef CONFIG_HUGETLB_PAGE
+/* Hugepages need one extra cache, initialized in hugetlbpage.c.  We
+ * can't put into the tables above, because HPAGE_SHIFT is not compile
+ * time constant. */
+kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)+1];
+#else
 kmem_cache_t *pgtable_cache[ARRAY_SIZE(pgtable_cache_size)];
+#endif
 
 void pgtable_cache_init(void)
 {
diff --git a/include/asm-powerpc/page_64.h b/include/asm-powerpc/page_64.h
index 3fb061bab9..eab779c219 100644
--- a/include/asm-powerpc/page_64.h
+++ b/include/asm-powerpc/page_64.h
@@ -101,6 +101,7 @@ extern unsigned int HPAGE_SHIFT;
 		                      - (1U << GET_HTLB_AREA(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+#define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 #define ARCH_HAS_SETCLEAR_HUGE_PTE
 
diff --git a/include/asm-powerpc/pgalloc.h b/include/asm-powerpc/pgalloc.h
index a00ee002cd..9f0917c686 100644
--- a/include/asm-powerpc/pgalloc.h
+++ b/include/asm-powerpc/pgalloc.h
@@ -17,11 +17,13 @@ extern kmem_cache_t *pgtable_cache[];
 #define PTE_CACHE_NUM	0
 #define PMD_CACHE_NUM	1
 #define PGD_CACHE_NUM	2
+#define HUGEPTE_CACHE_NUM 3
 #else
 #define PTE_CACHE_NUM	0
 #define PMD_CACHE_NUM	1
 #define PUD_CACHE_NUM	1
 #define PGD_CACHE_NUM	0
+#define HUGEPTE_CACHE_NUM 2
 #endif
 
 /*
-- 
2.39.5