From: Paul Jackson
Date: Sun, 8 Jan 2006 09:00:56 +0000 (-0800)
Subject: [PATCH] cpusets: swap migration interface
X-Git-Tag: v2.6.16-rc1~884
X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=45b07ef31d1182d2cfde7711327e3afb268bb1ac;p=linux-2.6

[PATCH] cpusets: swap migration interface

Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.

It defaults to false (file contains "0").  It can be set true by writing
"1" to the file.

If true, then anytime that a task is attached to the cpuset so marked,
the pages of that task will be moved to that cpuset, preserving, to the
extent practical, the cpuset-relative placement of the pages.

Also anytime that a cpuset so marked has its memory placement changed
(by writing to its "mems" file), the tasks in that cpuset will have
their pages moved to the cpuset's new nodes, preserving, to the extent
practical, the cpuset-relative placement of the moved pages.

Signed-off-by: Paul Jackson
Cc: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index a09a8eb806..e2d9afc30d 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -192,6 +192,7 @@ containing the following files describing that cpuset:
 
  - cpus: list of CPUs in that cpuset
  - mems: list of Memory Nodes in that cpuset
+ - memory_migrate flag: if set, move pages to cpusets nodes
  - cpu_exclusive flag: is cpu placement exclusive?
  - mem_exclusive flag: is memory placement exclusive?
  - tasks: list of tasks (by pid) attached to that cpuset
@@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset.  This is done to avoid
 impacting the scheduler code in the kernel with a check for changes
 in a tasks processor placement.
 
+Normally, once a page is allocated (given a physical page
+of main memory) then that page stays on whatever node it
+was allocated, so long as it remains allocated, even if the
+cpusets memory placement policy 'mems' subsequently changes.
+If the cpuset flag file 'memory_migrate' is set true, then when
+tasks are attached to that cpuset, any pages that task had
+allocated to it on nodes in its previous cpuset are migrated
+to the tasks new cpuset.  Depending on the implementation,
+this migration may either be done by swapping the page out,
+so that the next time the page is referenced, it will be paged
+into the tasks new cpuset, usually on the node where it was
+referenced, or this migration may be done by directly copying
+the pages from the tasks previous cpuset to the new cpuset,
+where possible to the same node, relative to the new cpuset,
+as the node that held the page, relative to the old cpuset.
+Also if 'memory_migrate' is set true, then if that cpusets
+'mems' file is modified, pages allocated to tasks in that
+cpuset, that were on nodes in the previous setting of 'mems',
+will be moved to nodes in the new setting of 'mems.'  Again,
+depending on the implementation, this might be done by swapping,
+or by direct copying.  In either case, pages that were not in
+the tasks prior cpuset, or in the cpusets prior 'mems' setting,
+will not be moved.
+
 There is an exception to the above.  If hotplug functionality is used
 to remove all the CPUs that are currently assigned to a cpuset,
 then the kernel will automatically update the cpus_allowed of all
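An illustrative aside, not part of the patch: the "cpuset-relative" placement
rule documented above means that a page sitting on the Nth node of the old
'mems' should, where possible, end up on the Nth node of the new 'mems'.  The
standalone sketch below models that remapping with plain arrays of node ids;
node_index() and remap_node() are hypothetical names used only here, and the
kernel itself works on nodemask_t bitmaps through do_migrate_pages() rather
than code like this.

/*
 * Illustrative sketch, not part of the patch: the "cpuset-relative"
 * rule modelled with plain arrays of node ids.
 */
#include <stdio.h>

/* Position of 'node' within nodes[count], or -1 if it is not listed. */
static int node_index(int node, const int *nodes, int count)
{
	int i;

	for (i = 0; i < count; i++)
		if (nodes[i] == node)
			return i;
	return -1;
}

/*
 * Map a node of the old placement to the node holding the same relative
 * position in the new placement; wrap around if the new placement is
 * smaller, and leave untouched any node not in the old placement.
 */
static int remap_node(int old_node, const int *from, int nfrom,
		      const int *to, int nto)
{
	int idx = node_index(old_node, from, nfrom);

	if (idx < 0 || nto == 0)
		return old_node;
	return to[idx % nto];
}

int main(void)
{
	int from[] = { 0, 1 };	/* old cpuset 'mems': nodes 0-1 */
	int to[]   = { 4, 5 };	/* new cpuset 'mems': nodes 4-5 */
	int n;

	for (n = 0; n < 2; n++)
		printf("page on node %d moves to node %d\n",
		       n, remap_node(n, from, 2, to, 2));
	return 0;
}

Compiled as an ordinary program, it reports that pages on nodes 0 and 1 of the
old placement land on nodes 4 and 5 of the new one, mirroring the behaviour the
documentation describes for the direct-copy case.
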
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 3e61e82968..66247eff24 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
 	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
 }
 
+static inline int do_migrate_pages(struct mm_struct *mm,
+			const nodemask_t *from_nodes,
+			const nodemask_t *to_nodes, int flags)
+{
+	return 0;
+}
+
 static inline void check_highest_zone(int k)
 {
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f98..f63383e01e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -87,6 +87,7 @@ struct cpuset {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
 	CS_NOTIFY_ON_RELEASE
 } cpuset_flagbits_t;
@@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs)
 	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
 /*
  * Increment this atomic integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -602,16 +608,24 @@ static void refresh_mems(void)
 	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
 		struct cpuset *cs;
 		nodemask_t oldmem = current->mems_allowed;
+		int migrate;
 
 		down(&callback_sem);
 		task_lock(current);
 		cs = current->cpuset;
+		migrate = is_memory_migrate(cs);
 		guarantee_online_mems(cs, &current->mems_allowed);
 		current->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(current);
 		up(&callback_sem);
-		if (!nodes_equal(oldmem, current->mems_allowed))
+		if (!nodes_equal(oldmem, current->mems_allowed)) {
 			numa_policy_rebind(&oldmem, &current->mems_allowed);
+			if (migrate) {
+				do_migrate_pages(current->mm, &oldmem,
+					&current->mems_allowed,
+					MPOL_MF_MOVE_ALL);
+			}
+		}
 	}
 }
 
@@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_NOTIFY_ON_RELEASE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
 * cs:	the cpuset to update
 * buf:	the buffer where we read the 0 or 1
 *
@@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
	struct task_struct *tsk;
	struct cpuset *oldcs;
	cpumask_t cpus;
+	nodemask_t from, to;

	if (sscanf(pidbuf, "%d", &pid) != 1)
		return -EIO;
@@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
	guarantee_online_cpus(cs, &cpus);
	set_cpus_allowed(tsk, cpus);

+	from = oldcs->mems_allowed;
+	to = cs->mems_allowed;
+
	up(&callback_sem);
+	if (is_memory_migrate(cs))
+		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
	put_task_struct(tsk);
	if (atomic_dec_and_test(&oldcs->count))
		check_for_release(oldcs, ppathbuf);
@@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 typedef enum {
	FILE_ROOT,
	FILE_DIR,
+	FILE_MEMORY_MIGRATE,
	FILE_CPULIST,
	FILE_MEMLIST,
	FILE_CPU_EXCLUSIVE,
@@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
	case FILE_NOTIFY_ON_RELEASE:
		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
		break;
+	case FILE_MEMORY_MIGRATE:
+		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+		break;
	case FILE_TASKLIST:
		retval = attach_task(cs, buffer, &pathbuf);
		break;
@@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
	case FILE_NOTIFY_ON_RELEASE:
		*s++ = notify_on_release(cs) ? '1' : '0';
		break;
+	case FILE_MEMORY_MIGRATE:
+		*s++ = is_memory_migrate(cs) ? '1' : '0';
+		break;
	default:
		retval = -EINVAL;
		goto out;
@@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = {
	.private = FILE_NOTIFY_ON_RELEASE,
};

+static struct cftype cft_memory_migrate = {
+	.name = "memory_migrate",
+	.private = FILE_MEMORY_MIGRATE,
+};
+
static int cpuset_populate_dir(struct dentry *cs_dentry)
{
	int err;
@@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
		return err;
	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
+		return err;
	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
		return err;
	return 0;
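
To try the new interface end to end, here is a minimal userspace sketch.  It
assumes the cpuset filesystem is mounted at /dev/cpuset and that a child
cpuset directory named "example" already exists with its 'cpus' and 'mems'
files populated; the directory name, the node written to 'mems', and the
write_file() helper are illustrative only, and error handling is kept terse.

/*
 * Hypothetical userspace example, not part of the patch.  Assumes the
 * cpuset filesystem is mounted at /dev/cpuset and that the child cpuset
 * "example" already exists with 'cpus' and 'mems' configured.
 */
#include <stdio.h>
#include <unistd.h>

/* Write a short string to a cpuset control file; returns 0 on success. */
static int write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	char pid[32];

	/* Enable page migration for this cpuset (it defaults to "0"). */
	if (write_file("/dev/cpuset/example/memory_migrate", "1"))
		return 1;

	/*
	 * Attach ourselves: with memory_migrate set, our pages are moved
	 * from the old cpuset's nodes to this cpuset's nodes.
	 */
	snprintf(pid, sizeof(pid), "%d", (int)getpid());
	if (write_file("/dev/cpuset/example/tasks", pid))
		return 1;

	/*
	 * Rewriting 'mems' now also moves the pages of tasks already in
	 * the cpuset to the new nodes (node 1 here is illustrative).
	 */
	return write_file("/dev/cpuset/example/mems", "1") ? 1 : 0;
}

Writing the pid to 'tasks' exercises the attach-time migration, and rewriting
'mems' afterwards exercises the placement-change migration, which are the two
cases described in the commit message.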