X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fvmstat.c;h=a2b6a9f96e5c6390625ec031763a74de9fa5f5b2;hb=1c0d04c9e44f4a248335c33d2be7c7f7b06ff359;hp=ad456202ff1a4ab28b0ab24e697c00d89efe5af6;hpb=f6ac2354d791195ca40822b84d73d48a4e8b7f2b;p=linux-2.6

diff --git a/mm/vmstat.c b/mm/vmstat.c
index ad456202ff..a2b6a9f96e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -3,145 +3,380 @@
  *
  *  Manages VM statistics
  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
+ *
+ *  zoned VM statistics
+ *  Copyright (C) 2006 Silicon Graphics, Inc.,
+ *		Christoph Lameter
  */
 
 #include <linux/config.h>
 #include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/cpu.h>
+
+void __get_zone_counts(unsigned long *active, unsigned long *inactive,
+			unsigned long *free, struct pglist_data *pgdat)
+{
+	struct zone *zones = pgdat->node_zones;
+	int i;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		*active += zones[i].nr_active;
+		*inactive += zones[i].nr_inactive;
+		*free += zones[i].free_pages;
+	}
+}
+
+void get_zone_counts(unsigned long *active,
+		unsigned long *inactive, unsigned long *free)
+{
+	struct pglist_data *pgdat;
+
+	*active = 0;
+	*inactive = 0;
+	*free = 0;
+	for_each_online_pgdat(pgdat) {
+		unsigned long l, m, n;
+		__get_zone_counts(&l, &m, &n, pgdat);
+		*active += l;
+		*inactive += m;
+		*free += n;
+	}
+}
+
+#ifdef CONFIG_VM_EVENT_COUNTERS
+DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
+EXPORT_PER_CPU_SYMBOL(vm_event_states);
+
+static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
+{
+	int cpu = 0;
+	int i;
+
+	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
+
+	cpu = first_cpu(*cpumask);
+	while (cpu < NR_CPUS) {
+		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
+
+		cpu = next_cpu(cpu, *cpumask);
+
+		if (cpu < NR_CPUS)
+			prefetch(&per_cpu(vm_event_states, cpu));
+
+		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+			ret[i] += this->event[i];
+	}
+}
 
 /*
- * Accumulate the page_state information across all CPUs.
+ * Accumulate the vm event counters across all CPUs.
  * The result is unavoidably approximate - it can change
  * during and after execution of this function.
+ */
+void all_vm_events(unsigned long *ret)
+{
+	sum_vm_events(ret, &cpu_online_map);
+}
+EXPORT_SYMBOL_GPL(all_vm_events);
+
+#ifdef CONFIG_HOTPLUG
+/*
+ * Fold the foreign cpu events into our own.
+ *
+ * This adds to the events on one processor
+ * but keeps the global counts constant.
+ */
+void vm_events_fold_cpu(int cpu)
+{
+	struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu);
+	int i;
+
+	for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
+		count_vm_events(i, fold_state->event[i]);
+		fold_state->event[i] = 0;
+	}
+}
+#endif /* CONFIG_HOTPLUG */
+
+#endif /* CONFIG_VM_EVENT_COUNTERS */
+
+/*
+ * Manage combined zone-based / global counters
+ *
+ * vm_stat contains the global counters
  */
-DEFINE_PER_CPU(struct page_state, page_states) = {0};
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+EXPORT_SYMBOL(vm_stat);
 
-atomic_t nr_pagecache = ATOMIC_INIT(0);
-EXPORT_SYMBOL(nr_pagecache);
 #ifdef CONFIG_SMP
-DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
-#endif
 
-static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+static int calculate_threshold(struct zone *zone)
 {
-	unsigned cpu;
+	int threshold;
+	int mem;	/* memory in 128 MB units */
+
+	/*
+	 * The threshold scales with the number of processors and the amount
+	 * of memory per zone. More memory means that we can defer updates for
+	 * longer; more processors could lead to more contention.
+	 * fls() is used as a cheap way of logarithmic scaling.
+	 *
+	 * Some sample thresholds:
+	 *
+	 * Threshold	Processors	(fls)	Zonesize	fls(mem+1)
+	 * ------------------------------------------------------------------
+	 * 8		1		1	0.9-1 GB	4
+	 * 16		2		2	0.9-1 GB	4
+	 * 20		2		2	1-2 GB		5
+	 * 24		2		2	2-4 GB		6
+	 * 28		2		2	4-8 GB		7
+	 * 32		2		2	8-16 GB		8
+	 * 4		2		2	<128M		1
+	 * 30		4		3	2-4 GB		5
+	 * 48		4		3	8-16 GB		8
+	 * 32		8		4	1-2 GB		4
+	 * 32		8		4	0.9-1 GB	4
+	 * 10		16		5	<128M		1
+	 * 40		16		5	900M		4
+	 * 70		64		7	2-4 GB		5
+	 * 84		64		7	4-8 GB		6
+	 * 108		512		9	4-8 GB		6
+	 * 125		1024		10	8-16 GB		8
+	 * 125		1024		10	16-32 GB	9
+	 */
+
+	mem = zone->present_pages >> (27 - PAGE_SHIFT);
+
+	threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
+
+	/*
+	 * Maximum threshold is 125
+	 */
+	threshold = min(125, threshold);
+
+	return threshold;
+}
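The scaling formula is easy to sanity-check outside the kernel. The following is a minimal userspace sketch, not kernel code: fls() is emulated with GCC's __builtin_clzl (assuming a 64-bit long), PAGE_SHIFT is assumed to be 12, and all names are illustrative. It reproduces the "16 / 2 processors / 0.9-1 GB" row of the table above:

	#include <stdio.h>

	static int fls_emul(unsigned long x)	/* stand-in for the kernel's fls() */
	{
		return x ? 64 - __builtin_clzl(x) : 0;
	}

	int main(void)
	{
		unsigned long present_pages = 229376;	/* 896 MB zone in 4 KB pages */
		int cpus = 2;
		int mem = present_pages >> (27 - 12);	/* zone size in 128 MB units */
		int threshold = 2 * fls_emul(cpus) * (1 + fls_emul(mem));

		if (threshold > 125)		/* same cap as calculate_threshold() */
			threshold = 125;
		printf("threshold = %d\n", threshold);	/* prints 16 */
		return 0;
	}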
 
-	memset(ret, 0, nr * sizeof(unsigned long));
-	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
+/*
+ * Refresh the thresholds for each zone.
+ */
+static void refresh_zone_stat_thresholds(void)
+{
+	struct zone *zone;
+	int cpu;
+	int threshold;
 
-	for_each_cpu_mask(cpu, *cpumask) {
-		unsigned long *in;
-		unsigned long *out;
-		unsigned off;
-		unsigned next_cpu;
+	for_each_zone(zone) {
 
-		in = (unsigned long *)&per_cpu(page_states, cpu);
+		if (!zone->present_pages)
+			continue;
 
-		next_cpu = next_cpu(cpu, *cpumask);
-		if (likely(next_cpu < NR_CPUS))
-			prefetch(&per_cpu(page_states, next_cpu));
+		threshold = calculate_threshold(zone);
 
-		out = (unsigned long *)ret;
-		for (off = 0; off < nr; off++)
-			*out++ += *in++;
+		for_each_online_cpu(cpu)
+			zone_pcp(zone, cpu)->stat_threshold = threshold;
 	}
 }
 
-void get_page_state_node(struct page_state *ret, int node)
+/*
+ * For use when we know that interrupts are disabled.
+ */
+void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+				int delta)
 {
-	int nr;
-	cpumask_t mask = node_to_cpumask(node);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
+	long x;
 
-	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
-	nr /= sizeof(unsigned long);
+	x = delta + *p;
 
-	__get_page_state(ret, nr+1, &mask);
+	if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) {
+		zone_page_state_add(x, zone, item);
+		x = 0;
+	}
+	*p = x;
 }
+EXPORT_SYMBOL(__mod_zone_page_state);
 
-void get_page_state(struct page_state *ret)
+/*
+ * For use when the interrupt state is unknown.
+ */
+void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
+				int delta)
 {
-	int nr;
-	cpumask_t mask = CPU_MASK_ALL;
-
-	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
-	nr /= sizeof(unsigned long);
+	unsigned long flags;
 
-	__get_page_state(ret, nr + 1, &mask);
+	local_irq_save(flags);
+	__mod_zone_page_state(zone, item, delta);
+	local_irq_restore(flags);
 }
+EXPORT_SYMBOL(mod_zone_page_state);
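The batching in __mod_zone_page_state() is the heart of the scheme: each CPU accumulates small deltas in a signed 8-bit per-cpu slot and touches the shared atomic counter only when the batch would exceed stat_threshold. A rough userspace analogue (illustrative names only; C11 atomics stand in for atomic_long_t, _Thread_local stands in for the per-cpu area):

	#include <stdatomic.h>
	#include <stdio.h>

	#define STAT_THRESHOLD 32

	static atomic_long global_count;		/* plays the role of vm_stat[item] */
	static _Thread_local signed char local_diff;	/* plays the role of pcp->vm_stat_diff[item] */

	static void mod_state(int delta)
	{
		long x = delta + local_diff;

		if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
			atomic_fetch_add(&global_count, x);	/* fold the whole batch at once */
			x = 0;
		}
		local_diff = x;			/* small residue stays CPU-local */
	}

	int main(void)
	{
		for (int i = 0; i < 100; i++)
			mod_state(+1);		/* e.g. 100 pages accounted */
		mod_state(-5);			/* and a few released */
		printf("global=%ld local=%d true=%ld\n",
			atomic_load(&global_count), local_diff,
			atomic_load(&global_count) + local_diff);
		return 0;
	}

A reader of the global counter can thus be off by at most (number of CPUs × stat_threshold); that accuracy-versus-contention trade-off is exactly what the threshold table above is tuning.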
 
-void get_full_page_state(struct page_state *ret)
+/*
+ * Optimized increment and decrement functions.
+ *
+ * These are only for a single page and therefore can take a struct page *
+ * argument instead of struct zone *. This allows the inclusion of the code
+ * generated for page_zone(page) into the optimized functions.
+ *
+ * No overflow check is necessary and therefore the differential can be
+ * incremented or decremented in place, which may allow the compilers to
+ * generate better code.
+ * The increment or decrement is known, and therefore one boundary check can
+ * be omitted.
+ *
+ * NOTE: These functions are very performance-sensitive. Change only
+ * with care.
+ *
+ * Some processors have inc/dec instructions that are atomic with respect to
+ * an interrupt. However, the code must first determine the differential
+ * location in a zone based on the processor number and then inc/dec the
+ * counter. There is no guarantee without disabling preemption that the
+ * processor will not change in between, so that atomicity cannot be
+ * exploited in a useful way here.
+ */
+static void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
 {
-	cpumask_t mask = CPU_MASK_ALL;
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
+
+	(*p)++;
+
+	if (unlikely(*p > pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
 
-	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
+		zone_page_state_add(*p + overstep, zone, item);
+		*p = -overstep;
+	}
 }
 
-unsigned long read_page_state_offset(unsigned long offset)
+void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-	unsigned long ret = 0;
-	int cpu;
+	__inc_zone_state(page_zone(page), item);
+}
+EXPORT_SYMBOL(__inc_zone_page_state);
 
-	for_each_online_cpu(cpu) {
-		unsigned long in;
+void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
+{
+	struct zone *zone = page_zone(page);
+	struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id());
+	s8 *p = pcp->vm_stat_diff + item;
+
+	(*p)--;
+
+	if (unlikely(*p < -pcp->stat_threshold)) {
+		int overstep = pcp->stat_threshold / 2;
 
-		in = (unsigned long)&per_cpu(page_states, cpu) + offset;
-		ret += *((unsigned long *)in);
+		zone_page_state_add(*p - overstep, zone, item);
+		*p = overstep;
 	}
-	return ret;
+}
+EXPORT_SYMBOL(__dec_zone_page_state);
+
+void inc_zone_state(struct zone *zone, enum zone_stat_item item)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__inc_zone_state(zone, item);
+	local_irq_restore(flags);
 }
 
-void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+void inc_zone_page_state(struct page *page, enum zone_stat_item item)
 {
-	void *ptr;
+	unsigned long flags;
+	struct zone *zone;
 
-	ptr = &__get_cpu_var(page_states);
-	*(unsigned long *)(ptr + offset) += delta;
+	zone = page_zone(page);
+	local_irq_save(flags);
+	__inc_zone_state(zone, item);
+	local_irq_restore(flags);
 }
-EXPORT_SYMBOL(__mod_page_state_offset);
+EXPORT_SYMBOL(inc_zone_page_state);
 
-void mod_page_state_offset(unsigned long offset, unsigned long delta)
+void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 {
 	unsigned long flags;
-	void *ptr;
 
 	local_irq_save(flags);
-	ptr = &__get_cpu_var(page_states);
-	*(unsigned long *)(ptr + offset) += delta;
+	__dec_zone_page_state(page, item);
 	local_irq_restore(flags);
 }
-EXPORT_SYMBOL(mod_page_state_offset);
+EXPORT_SYMBOL(dec_zone_page_state);
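The half-threshold "overstep" in the inc/dec paths deserves a note: instead of folding the local diff back to zero, the global counter is deliberately overshot by stat_threshold/2 and the diff left at the opposite sign, so a counter moving steadily in one direction gets roughly 1.5 thresholds of headroom before the next fold rather than one. A small single-threaded userspace demonstration (illustrative; threshold and iteration count are arbitrary):

	#include <stdio.h>

	int main(void)
	{
		int threshold = 32, diff = 0, overstep;
		long global = 0, folds = 0, i;

		for (i = 0; i < 1000; i++) {		/* 1000 increments, as in __inc_zone_state() */
			if (++diff > threshold) {
				overstep = threshold / 2;
				global += diff + overstep;	/* overshoot the fold... */
				diff = -overstep;		/* ...and bank negative headroom */
				folds++;
			}
		}
		/* the invariant the kernel relies on: global + local diff == true count */
		printf("true count %ld, global view %ld, folds %ld\n",
			global + diff, global, folds);
		return 0;
	}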
 
-void __get_zone_counts(unsigned long *active, unsigned long *inactive,
-	unsigned long *free, struct pglist_data *pgdat)
+/*
+ * Update the zone counters for one cpu.
+ */
+void refresh_cpu_vm_stats(int cpu)
 {
-	struct zone *zones = pgdat->node_zones;
+	struct zone *zone;
 	int i;
+	unsigned long flags;
 
-	*active = 0;
-	*inactive = 0;
-	*free = 0;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		*active += zones[i].nr_active;
-		*inactive += zones[i].nr_inactive;
-		*free += zones[i].free_pages;
+	for_each_zone(zone) {
+		struct per_cpu_pageset *pcp;
+
+		if (!populated_zone(zone))
+			continue;
+
+		pcp = zone_pcp(zone, cpu);
+
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			if (pcp->vm_stat_diff[i]) {
+				local_irq_save(flags);
+				zone_page_state_add(pcp->vm_stat_diff[i],
+					zone, i);
+				pcp->vm_stat_diff[i] = 0;
+				local_irq_restore(flags);
+			}
 	}
 }
 
-void get_zone_counts(unsigned long *active,
-		unsigned long *inactive, unsigned long *free)
+static void __refresh_cpu_vm_stats(void *dummy)
 {
-	struct pglist_data *pgdat;
+	refresh_cpu_vm_stats(smp_processor_id());
+}
 
-	*active = 0;
-	*inactive = 0;
-	*free = 0;
-	for_each_online_pgdat(pgdat) {
-		unsigned long l, m, n;
-		__get_zone_counts(&l, &m, &n, pgdat);
-		*active += l;
-		*inactive += m;
-		*free += n;
+/*
+ * Consolidate all counters.
+ *
+ * Note that the result is less inaccurate afterwards, but still inaccurate:
+ * concurrently running processes keep changing the counters.
+ */
+void refresh_vm_stats(void)
+{
+	on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
+}
+EXPORT_SYMBOL(refresh_vm_stats);
+
+#endif
+
+#ifdef CONFIG_NUMA
+/*
+ * zonelist = the list of zones passed to the allocator
+ * z	    = the zone from which the allocation occurred.
+ *
+ * Must be called with interrupts disabled.
+ */
+void zone_statistics(struct zonelist *zonelist, struct zone *z)
+{
+	if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) {
+		__inc_zone_state(z, NUMA_HIT);
+	} else {
+		__inc_zone_state(z, NUMA_MISS);
+		__inc_zone_state(zonelist->zones[0], NUMA_FOREIGN);
 	}
+	if (z->node == numa_node_id())
+		__inc_zone_state(z, NUMA_LOCAL);
+	else
+		__inc_zone_state(z, NUMA_OTHER);
 }
+#endif
 
 #ifdef CONFIG_PROC_FS
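zone_statistics() answers two independent questions about each allocation: did the preferred zone (zonelist->zones[0]) serve it (hit/miss, with a foreign charge against the preferred node on a miss), and did it land on the node the allocating CPU runs on (local/other). A self-contained userspace model of the same classification (all names here are illustrative, not kernel API):

	#include <stdio.h>

	struct numa_zone { int node; };	/* stand-in for struct zone */

	enum { NUMA_HIT, NUMA_MISS, NUMA_FOREIGN, NUMA_LOCAL, NUMA_OTHER, NR_ITEMS };

	static unsigned long stats[NR_ITEMS];

	static void classify(struct numa_zone *preferred, struct numa_zone *got,
			     int running_node)
	{
		if (got->node == preferred->node) {
			stats[NUMA_HIT]++;	/* allocated where intended */
		} else {
			stats[NUMA_MISS]++;	/* preferred node could not serve */
			stats[NUMA_FOREIGN]++;	/* in the kernel, charged to the preferred zone */
		}
		if (got->node == running_node)
			stats[NUMA_LOCAL]++;	/* on the allocating CPU's own node */
		else
			stats[NUMA_OTHER]++;
	}

	int main(void)
	{
		struct numa_zone node0 = { 0 }, node1 = { 1 };

		classify(&node0, &node0, 0);	/* hit + local */
		classify(&node0, &node1, 0);	/* miss + foreign + other */
		printf("hit=%lu miss=%lu foreign=%lu local=%lu other=%lu\n",
			stats[NUMA_HIT], stats[NUMA_MISS], stats[NUMA_FOREIGN],
			stats[NUMA_LOCAL], stats[NUMA_OTHER]);
		return 0;
	}

Because it rides on the __inc_zone_state() fast path, the real function inherits the interrupts-disabled requirement stated in its comment.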
@@ -203,23 +438,51 @@ struct seq_operations fragmentation_op = {
 	.show	= frag_show,
 };
 
+#ifdef CONFIG_ZONE_DMA32
+#define TEXT_FOR_DMA32(xx) xx "_dma32",
+#else
+#define TEXT_FOR_DMA32(xx)
+#endif
+
+#ifdef CONFIG_HIGHMEM
+#define TEXT_FOR_HIGHMEM(xx) xx "_high",
+#else
+#define TEXT_FOR_HIGHMEM(xx)
+#endif
+
+#define TEXTS_FOR_ZONES(xx) xx "_dma", TEXT_FOR_DMA32(xx) xx "_normal", \
+			TEXT_FOR_HIGHMEM(xx)
+
 static char *vmstat_text[] = {
+	/* Zoned VM counters */
+	"nr_anon_pages",
+	"nr_mapped",
+	"nr_file_pages",
+	"nr_slab_reclaimable",
+	"nr_slab_unreclaimable",
+	"nr_page_table_pages",
 	"nr_dirty",
 	"nr_writeback",
 	"nr_unstable",
-	"nr_page_table_pages",
-	"nr_mapped",
-	"nr_slab",
+	"nr_bounce",
+	"nr_vmscan_write",
+
+#ifdef CONFIG_NUMA
+	"numa_hit",
+	"numa_miss",
+	"numa_foreign",
+	"numa_interleave",
+	"numa_local",
+	"numa_other",
+#endif
 
+#ifdef CONFIG_VM_EVENT_COUNTERS
 	"pgpgin",
 	"pgpgout",
 	"pswpin",
 	"pswpout",
 
-	"pgalloc_high",
-	"pgalloc_normal",
-	"pgalloc_dma32",
-	"pgalloc_dma",
+	TEXTS_FOR_ZONES("pgalloc")
 
 	"pgfree",
 	"pgactivate",
@@ -228,25 +491,10 @@ static char *vmstat_text[] = {
 	"pgfault",
 	"pgmajfault",
 
-	"pgrefill_high",
-	"pgrefill_normal",
-	"pgrefill_dma32",
-	"pgrefill_dma",
-
-	"pgsteal_high",
-	"pgsteal_normal",
-	"pgsteal_dma32",
-	"pgsteal_dma",
-
-	"pgscan_kswapd_high",
-	"pgscan_kswapd_normal",
-	"pgscan_kswapd_dma32",
-	"pgscan_kswapd_dma",
-
-	"pgscan_direct_high",
-	"pgscan_direct_normal",
-	"pgscan_direct_dma32",
-	"pgscan_direct_dma",
+	TEXTS_FOR_ZONES("pgrefill")
+	TEXTS_FOR_ZONES("pgsteal")
+	TEXTS_FOR_ZONES("pgscan_kswapd")
+	TEXTS_FOR_ZONES("pgscan_direct")
 
 	"pginodesteal",
 	"slabs_scanned",
@@ -256,7 +504,7 @@ static char *vmstat_text[] = {
 	"allocstall",
 
 	"pgrotated",
-	"nr_bounce",
+#endif
 };
 
 /*
@@ -297,6 +545,11 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		   zone->nr_scan_active, zone->nr_scan_inactive,
 		   zone->spanned_pages,
 		   zone->present_pages);
+
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
+					zone_page_state(zone, i));
+
 		seq_printf(m,
 			   "\n        protection: (%lu",
 			   zone->lowmem_reserve[0]);
@@ -327,20 +580,9 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 				   pageset->pcp[j].high,
 				   pageset->pcp[j].batch);
 			}
-#ifdef CONFIG_NUMA
-			seq_printf(m,
-				   "\n          numa_hit:       %lu"
-				   "\n          numa_miss:      %lu"
-				   "\n          numa_foreign:   %lu"
-				   "\n          interleave_hit: %lu"
-				   "\n          local_node:     %lu"
-				   "\n          other_node:     %lu",
-				   pageset->numa_hit,
-				   pageset->numa_miss,
-				   pageset->numa_foreign,
-				   pageset->interleave_hit,
-				   pageset->local_node,
-				   pageset->other_node);
+#ifdef CONFIG_SMP
+			seq_printf(m, "\n  vm stats threshold: %d",
+					pageset->stat_threshold);
 #endif
 		}
 		seq_printf(m,
@@ -368,19 +610,34 @@ struct seq_operations zoneinfo_op = {
 
 static void *vmstat_start(struct seq_file *m, loff_t *pos)
 {
-	struct page_state *ps;
+	unsigned long *v;
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	unsigned long *e;
+#endif
+	int i;
 
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 
-	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
-	m->private = ps;
-	if (!ps)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long)
+			+ sizeof(struct vm_event_state), GFP_KERNEL);
+#else
+	v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
+			GFP_KERNEL);
+#endif
+	m->private = v;
+	if (!v)
 		return ERR_PTR(-ENOMEM);
-	get_full_page_state(ps);
-	ps->pgpgin /= 2;		/* sectors -> kbytes */
-	ps->pgpgout /= 2;
-	return (unsigned long *)ps + *pos;
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		v[i] = global_page_state(i);
+#ifdef CONFIG_VM_EVENT_COUNTERS
+	e = v + NR_VM_ZONE_STAT_ITEMS;
+	all_vm_events(e);
+	e[PGPGIN] /= 2;		/* sectors -> kbytes */
+	e[PGPGOUT] /= 2;
+#endif
+	return v + *pos;
 }
 
 static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -415,3 +672,35 @@ struct seq_operations vmstat_op = {
 
 #endif /* CONFIG_PROC_FS */
 
+#ifdef CONFIG_SMP
+/*
+ * Use the cpu notifier to ensure that the thresholds are recalculated
+ * when necessary.
+ */
+static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
+		unsigned long action,
+		void *hcpu)
+{
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_CANCELED:
+	case CPU_DEAD:
+		refresh_zone_stat_thresholds();
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata vmstat_notifier =
+	{ &vmstat_cpuup_callback, NULL, 0 };
+
+int __init setup_vmstat(void)
+{
+	refresh_zone_stat_thresholds();
+	register_cpu_notifier(&vmstat_notifier);
+	return 0;
+}
+module_init(setup_vmstat)
+#endif
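With vmstat_start() laying out the zoned counters first and, under CONFIG_VM_EVENT_COUNTERS, the event counters after them, each line of /proc/vmstat pairs one vmstat_text[] name with one value. A small userspace consumer (illustrative):

	#include <stdio.h>

	int main(void)
	{
		char name[64];
		unsigned long value;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f)
			return 1;
		while (fscanf(f, "%63s %lu", name, &value) == 2)
			printf("%-24s %lu\n", name, value);	/* e.g. "nr_dirty 118" */
		fclose(f);
		return 0;
	}

Note that pgpgin and pgpgout are already divided by two in vmstat_start(), so they read as kilobytes rather than sectors.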