From 8ad4b1fb8205340dba16b63467bb23efc27264d6 Mon Sep 17 00:00:00 2001
From: Rohit Seth
Date: Sun, 8 Jan 2006 01:00:40 -0800
Subject: [PATCH] [PATCH] Make high and batch sizes of per_cpu_pagelists configurable

As recently there has been lot of traffic on the right values for batch and
high water marks for per_cpu_pagelists.  This patch makes these two variables
configurable through /proc interface.

A new tunable /proc/sys/vm/percpu_pagelist_fraction is added.  This entry
controls the fraction of pages at most in each zone that are allocated for
each per cpu page list.  The min value for this is 8.  It means that we don't
allow more than 1/8th of pages in each zone to be allocated in any single
per_cpu_pagelist.

The batch value of each per cpu pagelist is also updated as a result.  It is
set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)

Signed-off-by: Rohit Seth
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/sysctl/vm.txt | 17 +++++++++++++
 include/linux/mmzone.h      |  2 ++
 include/linux/sysctl.h      |  1 +
 kernel/sysctl.c             | 12 +++++++++
 mm/page_alloc.c             | 49 +++++++++++++++++++++++++++++++++++++
 5 files changed, 81 insertions(+)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 89ba1a42a1..6910c0136f 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -103,3 +103,20 @@ This is used to force the Linux VM to keep a minimum number of kilobytes free.
 The VM uses this number to compute a pages_min value for each lowmem zone in
 the system.  Each lowmem zone gets a number of reserved free pages based
 proportionally on its size.
+
+==============================================================
+
+percpu_pagelist_fraction
+
+This is the fraction of pages at most (high mark pcp->high) in each zone that
+are allocated for each per cpu page list.  The min value for this is 8.  It
+means that we don't allow more than 1/8th of pages in each zone to be
+allocated in any single per_cpu_pagelist.  This entry only changes the value
+of hot per cpu pagelists.  User can specify a number like 100 to allocate
+1/100th of each zone to each per cpu page list.
+
+The batch value of each per cpu pagelist is also updated as a result.  It is
+set to pcp->high/4.  The upper limit of batch is (PAGE_SHIFT * 8)
+
+The initial value is zero.  Kernel does not use this value at boot time to set
+the high water marks for each per cpu page list.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c34f4a2c62..2a89c132ba 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -437,6 +437,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
+int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 4cd267fe87..7f472127b7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -181,6 +181,7 @@ enum
 	VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */
 	VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
+	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 };
 
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8dcf6fd5b0..03b0598f23 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -69,6 +69,7 @@ extern int printk_ratelimit_jiffies;
 extern int printk_ratelimit_burst;
 extern int pid_max_min, pid_max_max;
 extern int sysctl_drop_caches;
+extern int percpu_pagelist_fraction;
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 int unknown_nmi_panic;
@@ -79,6 +80,7 @@ extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
 static int minolduid;
+static int min_percpu_pagelist_fract = 8;
 
 static int ngroups_max = NGROUPS_MAX;
 
@@ -794,6 +796,16 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= VM_PERCPU_PAGELIST_FRACTION,
+		.procname	= "percpu_pagelist_fraction",
+		.data		= &percpu_pagelist_fraction,
+		.maxlen		= sizeof(percpu_pagelist_fraction),
+		.mode		= 0644,
+		.proc_handler	= &percpu_pagelist_fraction_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &min_percpu_pagelist_fract,
+	},
 #ifdef CONFIG_MMU
 	{
 		.ctl_name	= VM_MAX_MAP_COUNT,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5eeeadd9f6..2c46f697e8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
+int percpu_pagelist_fraction;
 
 static void fastcall free_hot_cold_page(struct page *page, int cold);
 
@@ -1831,6 +1832,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 	INIT_LIST_HEAD(&pcp->list);
 }
 
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+
 #ifdef CONFIG_NUMA
 /*
  * Boot pageset table. One per cpu which is going to be used for all
@@ -1868,6 +1887,10 @@ static int __devinit process_zones(int cpu)
 			goto bad;
 
 		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+				(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
@@ -2567,6 +2590,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
-- 
2.39.5
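For illustration only (not part of the patch): a minimal userspace sketch of
the pcp->high / pcp->batch arithmetic that setup_pagelist_highmark() applies.
The zone size and the PAGE_SHIFT value below are assumed example numbers, not
values read from a running kernel.

	#include <stdio.h>

	#define PAGE_SHIFT	12			/* assumed: 4 KB pages */

	int main(void)
	{
		unsigned long present_pages = 262144;	/* assumed example zone size */
		unsigned long fraction = 100;		/* value written to the sysctl */
		unsigned long high, batch;

		/* pcp->high: at most 1/fraction of the zone per hot per-cpu list */
		high = present_pages / fraction;

		/* pcp->batch: high/4, clamped to the range [1, PAGE_SHIFT * 8] */
		batch = high / 4;
		if (batch == 0)
			batch = 1;
		if (batch > PAGE_SHIFT * 8)
			batch = PAGE_SHIFT * 8;

		printf("pcp->high = %lu pages, pcp->batch = %lu pages\n", high, batch);
		return 0;
	}

With these assumed numbers, echoing 100 into /proc/sys/vm/percpu_pagelist_fraction
would leave each hot per cpu pagelist of that zone with pcp->high = 2621 pages and
pcp->batch = 96 pages (the PAGE_SHIFT * 8 cap for 4 KB pages).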