X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fslub.c;h=f426f9bc644b992fd12ec535ac8510b93682fe0c;hb=6b0c880dfefecedb9ad353014ed41505c32aca82;hp=968ce3776e083f2b4651ad0efc6ccadd2a9ba772;hpb=f64dc58c5412233d4d44b0275eaebdc11bde23b3;p=linux-2.6 diff --git a/mm/slub.c b/mm/slub.c index 968ce3776e..f426f9bc64 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -90,7 +90,7 @@ * One use of this flag is to mark slabs that are * used for allocations. Then such a slab becomes a cpu * slab. The cpu slab may be equipped with an additional - * lockless_freelist that allows lockless access to + * freelist that allows lockless access to * free objects in addition to the regular freelist * that requires the slab lock. * @@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page) /* * Issues still to be resolved: * - * - The per cpu array is updated for each new slab and and is a remote - * cacheline for most nodes. This could become a bouncing cacheline given - * enough frequent updates. There are 16 pointers in a cacheline, so at - * max 16 cpus could compete for the cacheline which may be okay. - * * - Support PAGE_ALLOC_DEBUG. Should be easy to do. * * - Variable sizing of the per node arrays @@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page) #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif -/* - * The page->inuse field is 16 bit thus we have this limitation - */ -#define MAX_OBJECTS_PER_SLAB 65535 - /* Internal SLUB flags */ #define __OBJECT_POISON 0x80000000 /* Poison object */ #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ @@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) #endif } +static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) +{ +#ifdef CONFIG_SMP + return s->cpu_slab[cpu]; +#else + return &s->cpu_slab; +#endif +} + static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) { @@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) slab_err(s, page, "Not a valid slab page"); return 0; } - if (page->offset * sizeof(void *) != s->offset) { - slab_err(s, page, "Corrupted offset %lu", - (unsigned long)(page->offset * sizeof(void *))); - return 0; - } if (page->inuse > s->objects) { slab_err(s, page, "inuse %u > max %u", s->name, page->inuse, s->objects); @@ -872,8 +866,6 @@ bad: slab_fix(s, "Marking all objects used"); page->inuse = s->objects; page->freelist = NULL; - /* Fix up fields that may be corrupted */ - page->offset = s->offset / sizeof(void *); } return 0; } @@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (s->flags & SLAB_CACHE_DMA) flags |= SLUB_DMA; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; + if (node == -1) page = alloc_pages(flags, s->order); else @@ -1088,19 +1083,19 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) void *last; void *p; - BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); + BUG_ON(flags & GFP_SLAB_BUG_MASK); if (flags & __GFP_WAIT) local_irq_enable(); - page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + page = allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); if (!page) goto out; n = get_node(s, page_to_nid(page)); if (n) atomic_long_inc(&n->nr_slabs); - page->offset = s->offset / sizeof(void *); page->slab = s; page->flags |= 1 << PG_slab; if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | @@ -1123,7 +1118,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->lockless_freelist = NULL; page->inuse = 0; out: if (flags & __GFP_WAIT) @@ -1149,7 +1143,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); - page->mapping = NULL; __free_pages(page, s->order); } @@ -1383,33 +1376,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) /* * Remove the cpu slab */ -static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) +static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { + struct page *page = c->page; /* * Merge cpu freelist into freelist. Typically we get here * because both freelists are empty. So this is unlikely * to occur. */ - while (unlikely(page->lockless_freelist)) { + while (unlikely(c->freelist)) { void **object; /* Retrieve object from cpu_freelist */ - object = page->lockless_freelist; - page->lockless_freelist = page->lockless_freelist[page->offset]; + object = c->freelist; + c->freelist = c->freelist[c->offset]; /* And put onto the regular freelist */ - object[page->offset] = page->freelist; + object[c->offset] = page->freelist; page->freelist = object; page->inuse--; } - s->cpu_slab[cpu] = NULL; + c->page = NULL; unfreeze_slab(s, page); } -static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) +static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - slab_lock(page); - deactivate_slab(s, page, cpu); + slab_lock(c->page); + deactivate_slab(s, c); } /* @@ -1418,18 +1412,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct page *page = s->cpu_slab[cpu]; + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - if (likely(page)) - flush_slab(s, page, cpu); + if (likely(c && c->page)) + flush_slab(s, c); } static void flush_cpu_slab(void *d) { struct kmem_cache *s = d; - int cpu = smp_processor_id(); - __flush_cpu_slab(s, cpu); + __flush_cpu_slab(s, smp_processor_id()); } static void flush_all(struct kmem_cache *s) @@ -1445,6 +1438,19 @@ static void flush_all(struct kmem_cache *s) #endif } +/* + * Check if the objects in a per cpu structure fit numa + * locality expectations. + */ +static inline int node_match(struct kmem_cache_cpu *c, int node) +{ +#ifdef CONFIG_NUMA + if (node != -1 && c->node != node) + return 0; +#endif + return 1; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. @@ -1463,45 +1469,46 @@ static void flush_all(struct kmem_cache *s) * we need to allocate a new slab. This is slowest path since we may sleep. */ static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct page *page) + gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) { void **object; - int cpu = smp_processor_id(); + struct page *new; - if (!page) + if (!c->page) goto new_slab; - slab_lock(page); - if (unlikely(node != -1 && page_to_nid(page) != node)) + slab_lock(c->page); + if (unlikely(!node_match(c, node))) goto another_slab; load_freelist: - object = page->freelist; + object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SlabDebug(page))) + if (unlikely(SlabDebug(c->page))) goto debug; - object = page->freelist; - page->lockless_freelist = object[page->offset]; - page->inuse = s->objects; - page->freelist = NULL; - slab_unlock(page); + object = c->page->freelist; + c->freelist = object[c->offset]; + c->page->inuse = s->objects; + c->page->freelist = NULL; + c->node = page_to_nid(c->page); + slab_unlock(c->page); return object; another_slab: - deactivate_slab(s, page, cpu); + deactivate_slab(s, c); new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - s->cpu_slab[cpu] = page; + new = get_partial(s, gfpflags, node); + if (new) { + c->page = new; goto load_freelist; } - page = new_slab(s, gfpflags, node); - if (page) { - cpu = smp_processor_id(); - if (s->cpu_slab[cpu]) { + new = new_slab(s, gfpflags, node); + if (new) { + c = get_cpu_slab(s, smp_processor_id()); + if (c->page) { /* * Someone else populated the cpu_slab while we * enabled interrupts, or we have gotten scheduled @@ -1509,34 +1516,33 @@ new_slab: * requested node even if __GFP_THISNODE was * specified. So we need to recheck. */ - if (node == -1 || - page_to_nid(s->cpu_slab[cpu]) == node) { + if (node_match(c, node)) { /* * Current cpuslab is acceptable and we * want the current one since its cache hot */ - discard_slab(s, page); - page = s->cpu_slab[cpu]; - slab_lock(page); + discard_slab(s, new); + slab_lock(c->page); goto load_freelist; } /* New slab does not fit our expectations */ - flush_slab(s, s->cpu_slab[cpu], cpu); + flush_slab(s, c); } - slab_lock(page); - SetSlabFrozen(page); - s->cpu_slab[cpu] = page; + slab_lock(new); + SetSlabFrozen(new); + c->page = new; goto load_freelist; } return NULL; debug: - object = page->freelist; - if (!alloc_debug_processing(s, page, object, addr)) + object = c->page->freelist; + if (!alloc_debug_processing(s, c->page, object, addr)) goto another_slab; - page->inuse++; - page->freelist = object[page->offset]; - slab_unlock(page); + c->page->inuse++; + c->page->freelist = object[c->offset]; + c->node = -1; + slab_unlock(c->page); return object; } @@ -1553,25 +1559,24 @@ debug: static void __always_inline *slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, void *addr) { - struct page *page; void **object; unsigned long flags; + struct kmem_cache_cpu *c; local_irq_save(flags); - page = s->cpu_slab[smp_processor_id()]; - if (unlikely(!page || !page->lockless_freelist || - (node != -1 && page_to_nid(page) != node))) + c = get_cpu_slab(s, smp_processor_id()); + if (unlikely(!c->freelist || !node_match(c, node))) - object = __slab_alloc(s, gfpflags, node, addr, page); + object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = page->lockless_freelist; - page->lockless_freelist = object[page->offset]; + object = c->freelist; + c->freelist = object[c->offset]; } local_irq_restore(flags); if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, s->objsize); + memset(object, 0, c->objsize); return object; } @@ -1599,7 +1604,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr) + void *x, void *addr, unsigned int offset) { void *prior; void **object = (void *)x; @@ -1609,7 +1614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, if (unlikely(SlabDebug(page))) goto debug; checks_ok: - prior = object[page->offset] = page->freelist; + prior = object[offset] = page->freelist; page->freelist = object; page->inuse--; @@ -1664,15 +1669,16 @@ static void __always_inline slab_free(struct kmem_cache *s, { void **object = (void *)x; unsigned long flags; + struct kmem_cache_cpu *c; local_irq_save(flags); debug_check_no_locks_freed(object, s->objsize); - if (likely(page == s->cpu_slab[smp_processor_id()] && - !SlabDebug(page))) { - object[page->offset] = page->lockless_freelist; - page->lockless_freelist = object; + c = get_cpu_slab(s, smp_processor_id()); + if (likely(page == c->page && c->node >= 0)) { + object[c->offset] = c->freelist; + c->freelist = object; } else - __slab_free(s, page, x, addr); + __slab_free(s, page, x, addr, c->offset); local_irq_restore(flags); } @@ -1759,14 +1765,6 @@ static inline int slab_order(int size, int min_objects, int rem; int min_order = slub_min_order; - /* - * If we would create too many object per slab then reduce - * the slab order even if it goes below slub_min_order. - */ - while (min_order > 0 && - (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) - min_order--; - for (order = max(min_order, fls(min_objects * size - 1) - PAGE_SHIFT); order <= max_order; order++) { @@ -1781,9 +1779,6 @@ static inline int slab_order(int size, int min_objects, if (rem <= slab_size / fract_leftover) break; - /* If the next size is too high then exit now */ - if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) - break; } return order; @@ -1858,6 +1853,16 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } +static void init_kmem_cache_cpu(struct kmem_cache *s, + struct kmem_cache_cpu *c) +{ + c->page = NULL; + c->freelist = NULL; + c->node = 0; + c->offset = s->offset / sizeof(void *); + c->objsize = s->objsize; +} + static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; @@ -1869,6 +1874,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) #endif } +#ifdef CONFIG_SMP +/* + * Per cpu array for per cpu structures. + * + * The per cpu array places all kmem_cache_cpu structures from one processor + * close together meaning that it becomes possible that multiple per cpu + * structures are contained in one cacheline. This may be particularly + * beneficial for the kmalloc caches. + * + * A desktop system typically has around 60-80 slabs. With 100 here we are + * likely able to get per cpu structures for all caches from the array defined + * here. We must be able to cover all kmalloc caches during bootstrap. + * + * If the per cpu array is exhausted then fall back to kmalloc + * of individual cachelines. No sharing is possible then. + */ +#define NR_KMEM_CACHE_CPU 100 + +static DEFINE_PER_CPU(struct kmem_cache_cpu, + kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; + +static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); +static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; + +static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, + int cpu, gfp_t flags) +{ + struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); + + if (c) + per_cpu(kmem_cache_cpu_free, cpu) = + (void *)c->freelist; + else { + /* Table overflow: So allocate ourselves */ + c = kmalloc_node( + ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), + flags, cpu_to_node(cpu)); + if (!c) + return NULL; + } + + init_kmem_cache_cpu(s, c); + return c; +} + +static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) +{ + if (c < per_cpu(kmem_cache_cpu, cpu) || + c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { + kfree(c); + return; + } + c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); + per_cpu(kmem_cache_cpu_free, cpu) = c; +} + +static void free_kmem_cache_cpus(struct kmem_cache *s) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) { + s->cpu_slab[cpu] = NULL; + free_kmem_cache_cpu(c, cpu); + } + } +} + +static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c) + continue; + + c = alloc_kmem_cache_cpu(s, cpu, flags); + if (!c) { + free_kmem_cache_cpus(s); + return 0; + } + s->cpu_slab[cpu] = c; + } + return 1; +} + +/* + * Initialize the per cpu array. + */ +static void init_alloc_cpu_cpu(int cpu) +{ + int i; + + if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) + return; + + for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) + free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); + + cpu_set(cpu, kmem_cach_cpu_free_init_once); +} + +static void __init init_alloc_cpu(void) +{ + int cpu; + + for_each_online_cpu(cpu) + init_alloc_cpu_cpu(cpu); + } + +#else +static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} +static inline void init_alloc_cpu(void) {} + +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) +{ + init_kmem_cache_cpu(s, &s->cpu_slab); + return 1; +} +#endif + #ifdef CONFIG_NUMA /* * No kmalloc_node yet so do it by hand. We know that this is the first @@ -1876,7 +2006,8 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) * possible. * * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. + * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * memory on a fresh node that has no slab structures yet. */ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, int node) @@ -2077,14 +2208,7 @@ static int calculate_sizes(struct kmem_cache *s) */ s->objects = (PAGE_SIZE << s->order) / size; - /* - * Verify that the number of objects is within permitted limits. - * The page->inuse field is only 16 bit wide! So we cannot have - * more than 64k objects per slab. - */ - if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) - return 0; - return 1; + return !!s->objects; } @@ -2107,9 +2231,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->defrag_ratio = 100; #endif + if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + goto error; - if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " @@ -2192,6 +2319,7 @@ static inline int kmem_cache_close(struct kmem_cache *s) flush_all(s); /* Attempt to free all objects */ + free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2579,6 +2707,8 @@ void __init kmem_cache_init(void) int i; int caches = 0; + init_alloc_cpu(); + #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the @@ -2639,10 +2769,12 @@ void __init kmem_cache_init(void) #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct kmem_cache_cpu *); +#else + kmem_size = sizeof(struct kmem_cache); #endif - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct page *); printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -2721,12 +2853,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { + int cpu; + s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ s->objsize = max(s->objsize, (int)size); + + /* + * And then we need to update the object size in the + * per cpu structures + */ + for_each_online_cpu(cpu) + get_cpu_slab(s, cpu)->objsize = s->objsize; s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); up_write(&slub_lock); if (sysfs_slab_alias(s, name)) @@ -2769,15 +2910,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_alloc_cpu_cpu(cpu); + down_read(&slub_lock); + list_for_each_entry(s, &slab_caches, list) + s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, + GFP_KERNEL); + up_read(&slub_lock); + break; + case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); + free_kmem_cache_cpu(c, cpu); + s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -3130,7 +3285,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_KERNEL)) + GFP_TEMPORARY)) return sprintf(buf, "Out of memory\n"); /* Push back cpu slabs */ @@ -3244,11 +3399,18 @@ static unsigned long slab_objects(struct kmem_cache *s, per_cpu = nodes + nr_node_ids; for_each_possible_cpu(cpu) { - struct page *page = s->cpu_slab[cpu]; + struct page *page; int node; + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + if (!c) + continue; + + page = c->page; + node = c->node; + if (node < 0) + continue; if (page) { - node = page_to_nid(page); if (flags & SO_CPU) { int x = 0; @@ -3305,13 +3467,19 @@ static int any_slab_objects(struct kmem_cache *s) int node; int cpu; - for_each_possible_cpu(cpu) - if (s->cpu_slab[cpu]) + for_each_possible_cpu(cpu) { + struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + + if (c && c->page) return 1; + } - for_each_node(node) { + for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); + if (!n) + continue; + if (n->nr_partial || atomic_long_read(&n->nr_slabs)) return 1; }