hugetlb reservations: fix hugetlb MAP_PRIVATE reservations across vma splits

author Andy Whitcroft <apw@shadowen.org>

Thu, 24 Jul 2008 04:27:32 +0000 (21:27 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)
author Andy Whitcroft <apw@shadowen.org>
Thu, 24 Jul 2008 04:27:32 +0000 (21:27 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 72acbb29d2cc68ca50161af90d3cbe042c9852aa..65616941a383a3304bbccdd50a1858b112195534 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -43,6 +43,16 @@ static DEFINE_SPINLOCK(hugetlb_lock);
  /*
   * Region tracking -- allows tracking of reservations and instantiated pages
   *                    across the pages in a mapping.
+ *
+ * The region data structures are protected by a combination of the mmap_sem
+ * and the hugetlb_instantiation_mutex.  To access or modify a region the
+ * caller must either hold the mmap_sem for write, or the mmap_sem for read
+ * and the hugetlb_instantiation_mutex:
+ *
+ *     down_write(&mm->mmap_sem);
+ * or
+ *     down_read(&mm->mmap_sem);
+ *     mutex_lock(&hugetlb_instantiation_mutex);
   */
  struct file_region {
         struct list_head link;
@@ -165,6 +175,30 @@ static long region_truncate(struct list_head *head, long end)
         return chg;
  }
  
+static long region_count(struct list_head *head, long f, long t)
+{
+       struct file_region *rg;
+       long chg = 0;
+
+       /* Sum the overlap of [f, t) with each region on the list. */
+       list_for_each_entry(rg, head, link) {
+               /* long like f and t, so large offsets are not truncated */
+               long seg_from, seg_to;
+
+               if (rg->to <= f)
+                       continue;
+               if (rg->from >= t)
+                       break;
+
+               seg_from = max(rg->from, f);
+               seg_to = min(rg->to, t);
+
+               chg += seg_to - seg_from;
+       }
+
+       return chg;
+}
+
  /*
   * Convert the address within this vma to the page offset within
   * the mapping, in base page units.
@@ -187,9 +221,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
                         (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
  }
  
-#define HPAGE_RESV_OWNER    (1UL << (BITS_PER_LONG - 1))
-#define HPAGE_RESV_UNMAPPED (1UL << (BITS_PER_LONG - 2))
+/*
+ * Flags for MAP_PRIVATE reservations.  These are stored in the bottom
+ * bits of the reservation map pointer, which are always clear due to
+ * alignment.
+ */
+#define HPAGE_RESV_OWNER    (1UL << 0)
+#define HPAGE_RESV_UNMAPPED (1UL << 1)
  #define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
+
  /*
   * These helpers are used to track how many pages are reserved for
   * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
@@ -199,6 +239,15 @@ static pgoff_t vma_pagecache_offset(struct vm_area_struct *vma,
   * the reserve counters are updated with the hugetlb_lock held. It is safe
   * to reset the VMA at fork() time as it is not in use yet and there is no
   * chance of the global counters getting corrupted as a result of the values.
+ *
+ * The private mapping reservation is represented in a subtly different
+ * manner to a shared mapping.  A shared mapping has a region map associated
+ * with the underlying file; this region map represents the backing file
+ * pages which have ever had a reservation assigned, and this persists even
+ * after the page is instantiated.  A private mapping has a region map
+ * associated with the original mmap which is attached to all VMAs which
+ * reference it; this region map represents those offsets which have consumed
+ * reservation, i.e. where pages have been instantiated.
   */
  static unsigned long get_vma_private_data(struct vm_area_struct *vma)
  {
@@ -211,22 +260,48 @@ static void set_vma_private_data(struct vm_area_struct *vma,
         vma->vm_private_data = (void *)value;
  }
  
-static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+struct resv_map {
+       struct kref refs;
+       struct list_head regions;
+};
+
+struct resv_map *resv_map_alloc(void)
+{
+       struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
+       if (!resv_map)
+               return NULL;
+
+       kref_init(&resv_map->refs);
+       INIT_LIST_HEAD(&resv_map->regions);
+
+       return resv_map;
+}
+
+void resv_map_release(struct kref *ref)
+{
+       struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+
+       /* Clear out any active regions before we release the map. */
+       region_truncate(&resv_map->regions, 0);
+       kfree(resv_map);
+}
+
+static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
  {
         VM_BUG_ON(!is_vm_hugetlb_page(vma));
         if (!(vma->vm_flags & VM_SHARED))
-               return get_vma_private_data(vma) & ~HPAGE_RESV_MASK;
+               return (struct resv_map *)(get_vma_private_data(vma) &
+                                                       ~HPAGE_RESV_MASK);
         return 0;
  }
  
-static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
-                                                       unsigned long reserve)
+static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
  {
         VM_BUG_ON(!is_vm_hugetlb_page(vma));
         VM_BUG_ON(vma->vm_flags & VM_SHARED);
  
-       set_vma_private_data(vma,
-               (get_vma_private_data(vma) & HPAGE_RESV_MASK) | reserve);
+       set_vma_private_data(vma, (get_vma_private_data(vma) &
+                               HPAGE_RESV_MASK) | (unsigned long)map);
  }
  
  static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
@@ -253,19 +328,12 @@ static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
         if (vma->vm_flags & VM_SHARED) {
                 /* Shared mappings always use reserves */
                 resv_huge_pages--;
-       } else {
+       } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
                 /*
                  * Only the process that called mmap() has reserves for
                  * private mappings.
                  */
-               if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
-                       unsigned long flags, reserve;
-                       resv_huge_pages--;
-                       flags = (unsigned long)vma->vm_private_data &
-                                                       HPAGE_RESV_MASK;
-                       reserve = (unsigned long)vma->vm_private_data - 1;
-                       vma->vm_private_data = (void *)(reserve | flags);
-               }
+               resv_huge_pages--;
         }
  }
  
@@ -282,7 +350,7 @@ static int vma_has_private_reserves(struct vm_area_struct *vma)
  {
         if (vma->vm_flags & VM_SHARED)
                 return 0;
-       if (!vma_resv_huge_pages(vma))
+       if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 return 0;
         return 1;
  }
@@ -742,12 +810,19 @@ static int vma_needs_reservation(struct vm_area_struct *vma, unsigned long addr)
                 return region_chg(&inode->i_mapping->private_list,
                                                         idx, idx + 1);
  
-       } else {
-               if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER))
-                       return 1;
-       }
+       } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+               return 1;
  
-       return 0;
+       } else  {
+               int err;
+               pgoff_t idx = vma_pagecache_offset(vma, addr);
+               struct resv_map *reservations = vma_resv_map(vma);
+
+               err = region_chg(&reservations->regions, idx, idx + 1);
+               if (err < 0)
+                       return err;
+               return 0;
+       }
  }
  static void vma_commit_reservation(struct vm_area_struct *vma,
                                                         unsigned long addr)
@@ -758,6 +833,13 @@ static void vma_commit_reservation(struct vm_area_struct *vma,
         if (vma->vm_flags & VM_SHARED) {
                 pgoff_t idx = vma_pagecache_offset(vma, addr);
                 region_add(&inode->i_mapping->private_list, idx, idx + 1);
+
+       } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+               pgoff_t idx = vma_pagecache_offset(vma, addr);
+               struct resv_map *reservations = vma_resv_map(vma);
+
+               /* Mark this page used in the map. */
+               region_add(&reservations->regions, idx, idx + 1);
         }
  }
  
@@ -1047,11 +1129,41 @@ out:
         return ret;
  }
  
+static void hugetlb_vm_op_open(struct vm_area_struct *vma)
+{
+       struct resv_map *reservations = vma_resv_map(vma);
+
+       /*
+        * This new VMA should share its sibling's reservation map if present.
+        * The VMA will only ever have a valid reservation map pointer where
+        * it is being copied for another still existing VMA.  As that VMA
+        * has a reference to the reservation map it cannot disappear until
+        * after this open call completes.  It is therefore safe to take a
+        * new reference here without additional locking.
+        */
+       if (reservations)
+               kref_get(&reservations->refs);
+}
+
  static void hugetlb_vm_op_close(struct vm_area_struct *vma)
  {
-       unsigned long reserve = vma_resv_huge_pages(vma);
-       if (reserve)
-               hugetlb_acct_memory(-reserve);
+       struct resv_map *reservations = vma_resv_map(vma);
+       unsigned long reserve;
+       unsigned long start;
+       unsigned long end;
+
+       if (reservations) {
+               start = vma_pagecache_offset(vma, vma->vm_start);
+               end = vma_pagecache_offset(vma, vma->vm_end);
+
+               reserve = (end - start) -
+                       region_count(&reservations->regions, start, end);
+
+               kref_put(&reservations->refs, resv_map_release);
+
+               if (reserve)
+                       hugetlb_acct_memory(-reserve);
+       }
  }
  
  /*
@@ -1068,6 +1180,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
  
  struct vm_operations_struct hugetlb_vm_ops = {
         .fault = hugetlb_vm_op_fault,
+       .open = hugetlb_vm_op_open,
         .close = hugetlb_vm_op_close,
  };
  
@@ -1617,8 +1730,13 @@ int hugetlb_reserve_pages(struct inode *inode,
         if (!vma || vma->vm_flags & VM_SHARED)
                 chg = region_chg(&inode->i_mapping->private_list, from, to);
         else {
+               struct resv_map *resv_map = resv_map_alloc();
+               if (!resv_map)
+                       return -ENOMEM;
+
                 chg = to - from;
-               set_vma_resv_huge_pages(vma, chg);
+
+               set_vma_resv_map(vma, resv_map);
                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
         }
author	Andy Whitcroft <apw@shadowen.org>
	Thu, 24 Jul 2008 04:27:32 +0000 (21:27 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 24 Jul 2008 17:47:16 +0000 (10:47 -0700)