Merge branch 'upstream' of master.kernel.org:/pub/scm/linux/kernel/git/jgarzik/misc-2.6

author Linus Torvalds <torvalds@g5.osdl.org>

Thu, 8 Sep 2005 00:28:25 +0000 (17:28 -0700)

committer Linus Torvalds <torvalds@g5.osdl.org>

Thu, 8 Sep 2005 00:28:25 +0000 (17:28 -0700)
author Linus Torvalds <torvalds@g5.osdl.org>
Thu, 8 Sep 2005 00:28:25 +0000 (17:28 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Thu, 8 Sep 2005 00:28:25 +0000 (17:28 -0700)
diff --combined kernel/sched.c

index 9508527845df1d91beecb758ddd4f5626b11e77c,f41fa94d2070b5f525f59775dff0fd9b67e567f1..18b95520a2e29bdfa181aecb913f0ea50995f609
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -1478,6 -1478,7 +1478,7 @@@ static inline void prepare_task_switch(
   
   /**
    * finish_task_switch - clean up after a task-switch
+  * @rq: runqueue associated with task-switch
    * @prev: the thread we just switched away from.
    *
    * finish_task_switch must be called after the context switch, paired
@@@ -4779,7 -4780,7 +4780,7 @@@ static int sd_parent_degenerate(struct 
    * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
    * hold the hotplug lock.
    */
- -void cpu_attach_domain(struct sched_domain *sd, int cpu)
+ +static void cpu_attach_domain(struct sched_domain *sd, int cpu)
   {
         runqueue_t *rq = cpu_rq(cpu);
         struct sched_domain *tmp;
@@@ -4802,7 -4803,7 +4803,7 @@@
   }
   
   /* cpus with isolated domains */
- -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+ +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
   
   /* Setup the mask of cpus configured for isolated domains */
   static int __init isolated_cpu_setup(char *str)
@@@ -4830,8 -4831,8 +4831,8 @@@ __setup ("isolcpus=", isolated_cpu_setu
    * covered by the given span, and will set each group's ->cpumask correctly,
    * and ->cpu_power to 0.
    */
- -void init_sched_build_groups(struct sched_group groups[],
- -                      cpumask_t span, int (*group_fn)(int cpu))
+ +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ +                                  int (*group_fn)(int cpu))
   {
         struct sched_group *first = NULL, *last = NULL;
         cpumask_t covered = CPU_MASK_NONE;
@@@ -4864,85 -4865,12 +4865,85 @@@
         last->next = first;
   }
   
+ +#define SD_NODES_PER_DOMAIN 16
   
- -#ifdef ARCH_HAS_SCHED_DOMAIN
- -extern void build_sched_domains(const cpumask_t *cpu_map);
- -extern void arch_init_sched_domains(const cpumask_t *cpu_map);
- -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
- -#else
+ +#ifdef CONFIG_NUMA
+ +/**
+ + * find_next_best_node - find the next node to include in a sched_domain
+ + * @node: node whose sched_domain we're building
+ + * @used_nodes: bitmap of nodes already in the sched_domain (updated here)
+ + *
+ + * Returns the node closest to @node, by node_distance(), that is not yet
+ + * set in @used_nodes and for which nr_cpus_node() is non-zero, and marks
+ + * it in @used_nodes.  If no such node is left, node 0 is marked and
+ + * returned.  Should use nodemask_t.
+ + */
+ +static int find_next_best_node(int node, unsigned long *used_nodes)
+ +{
+ +      int i, n, val, min_val, best_node = 0;
+ +
+ +      min_val = INT_MAX;
+ +
+ +      for (i = 0; i < MAX_NUMNODES; i++) {
+ +              /* Visit every node id once, starting at @node and wrapping */
+ +              n = (node + i) % MAX_NUMNODES;
+ +
+ +              if (!nr_cpus_node(n))
+ +                      continue;
+ +
+ +              /* Skip already used nodes */
+ +              if (test_bit(n, used_nodes))
+ +                      continue;
+ +
+ +              /* Min distance search; strict '<' keeps the first of equals */
+ +              val = node_distance(node, n);
+ +
+ +              if (val < min_val) {
+ +                      min_val = val;
+ +                      best_node = n;
+ +              }
+ +      }
+ +
+ +      set_bit(best_node, used_nodes);
+ +      return best_node;
+ +}
+ +
+ +/**
+ + * sched_domain_node_span - get a cpumask for a node's sched_domain
+ + * @node: node whose cpumask we're constructing
+ + *
+ + * Given a node, construct a good cpumask for its sched_domain to span.  It
+ + * should be one that prevents unnecessary balancing, but also spreads tasks
+ + * out optimally.  The span is the union of the CPUs of @node and of the
+ + * SD_NODES_PER_DOMAIN - 1 nodes picked by find_next_best_node().
+ + */
+ +static cpumask_t sched_domain_node_span(int node)
+ +{
+ +      int i;
+ +      cpumask_t span, nodemask;
+ +      DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+ +
+ +      cpus_clear(span);
+ +      bitmap_zero(used_nodes, MAX_NUMNODES);
+ +
+ +      nodemask = node_to_cpumask(node);
+ +      cpus_or(span, span, nodemask);
+ +      set_bit(node, used_nodes);
+ +
+ +      for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { /* @node is already in */
+ +              int next_node = find_next_best_node(node, used_nodes);
+ +              nodemask = node_to_cpumask(next_node);
+ +              cpus_or(span, span, nodemask);
+ +      }
+ +
+ +      return span;
+ +}
+ +#endif
+ +
+ +/*
+ + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ + * can switch it on easily if needed.
+ + */
   #ifdef CONFIG_SCHED_SMT
   static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
   static struct sched_group sched_group_cpus[NR_CPUS];
@@@ -4964,20 -4892,36 +4965,20 @@@ static int cpu_to_phys_group(int cpu
   }
   
   #ifdef CONFIG_NUMA
- -
- -static DEFINE_PER_CPU(struct sched_domain, node_domains);
- -static struct sched_group sched_group_nodes[MAX_NUMNODES];
- -static int cpu_to_node_group(int cpu)
- -{
- -      return cpu_to_node(cpu);
- -}
- -#endif
- -
- -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
   /*
- - * The domains setup code relies on siblings not spanning
- - * multiple nodes. Make sure the architecture has a proper
- - * siblings map:
+ + * init_sched_build_groups() can't handle what we want to do with node
+ + * groups, so roll our own. Each node now has its own list of groups,
+ + * which is dynamically allocated.
    */
- -static void check_sibling_maps(void)
- -{
- -      int i, j;
+ +static DEFINE_PER_CPU(struct sched_domain, node_domains);
+ +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
   
- -      for_each_online_cpu(i) {
- -              for_each_cpu_mask(j, cpu_sibling_map[i]) {
- -                      if (cpu_to_node(i) != cpu_to_node(j)) {
- -                              printk(KERN_INFO "warning: CPU %d siblings map "
- -                                      "to different node - isolating "
- -                                      "them.\n", i);
- -                              cpu_sibling_map[i] = cpumask_of_cpu(i);
- -                              break;
- -                      }
- -              }
- -      }
+ +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+ +static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
+ +
+ +static int cpu_to_allnodes_group(int cpu)
+ +{
+ +      return cpu_to_node(cpu);  /* group index is the cpu's node id */
  }
   #endif
   
@@@ -4985,24 -4929,9 +4986,24 @@@
    * Build sched domains for a given set of cpus and attach the sched domains
    * to the individual cpus
    */
- -static void build_sched_domains(const cpumask_t *cpu_map)
+ +void build_sched_domains(const cpumask_t *cpu_map)
   {
         int i;
+ +#ifdef CONFIG_NUMA
+ +      struct sched_group **sched_group_nodes = NULL;
+ +      struct sched_group *sched_group_allnodes = NULL;
+ +
+ +      /*
+ +       * Allocate the per-node list of sched groups
+ +       */
+ +      sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ +                                         GFP_ATOMIC);
+ +      if (!sched_group_nodes) {
+ +              printk(KERN_WARNING "Can not alloc sched group node list\n");
+ +              return;
+ +      }
+ +      sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+ +#endif
   
         /*
          * Set up domains for cpus specified by the cpu_map.
@@@ -5015,35 -4944,11 +5016,35 @@@
                 cpus_and(nodemask, nodemask, *cpu_map);
   
   #ifdef CONFIG_NUMA
+ +              if (cpus_weight(*cpu_map)
+ +                              > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ +                      if (!sched_group_allnodes) {
+ +                              sched_group_allnodes
+ +                                      = kmalloc(sizeof(struct sched_group)
+ +                                                      * MAX_NUMNODES,
+ +                                                GFP_KERNEL);
+ +                              if (!sched_group_allnodes) {
+ +                                      printk(KERN_WARNING
+ +                                      "Can not alloc allnodes sched group\n");
+ +                                      break;
+ +                              }
+ +                              sched_group_allnodes_bycpu[i]
+ +                                              = sched_group_allnodes;
+ +                      }
+ +                      sd = &per_cpu(allnodes_domains, i);
+ +                      *sd = SD_ALLNODES_INIT;
+ +                      sd->span = *cpu_map;
+ +                      group = cpu_to_allnodes_group(i);
+ +                      sd->groups = &sched_group_allnodes[group];
+ +                      p = sd;
+ +              } else
+ +                      p = NULL;
+ +
                 sd = &per_cpu(node_domains, i);
- -              group = cpu_to_node_group(i);
                 *sd = SD_NODE_INIT;
- -              sd->span = *cpu_map;
- -              sd->groups = &sched_group_nodes[group];
+ +              sd->span = sched_domain_node_span(cpu_to_node(i));
+ +              sd->parent = p;
+ +              cpus_and(sd->span, sd->span, *cpu_map);
   #endif
   
                 p = sd;
@@@ -5068,7 -4973,7 +5069,7 @@@
   
   #ifdef CONFIG_SCHED_SMT
         /* Set up CPU (sibling) groups */
- -      for_each_online_cpu(i) {
+ +      for_each_cpu_mask(i, *cpu_map) {
                 cpumask_t this_sibling_map = cpu_sibling_map[i];
                 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
                 if (i != first_cpu(this_sibling_map))
@@@ -5093,77 -4998,8 +5094,77 @@@
   
   #ifdef CONFIG_NUMA
         /* Set up node groups */
- -      init_sched_build_groups(sched_group_nodes, *cpu_map,
- -                                      &cpu_to_node_group);
+ +      if (sched_group_allnodes)
+ +              init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ +                                      &cpu_to_allnodes_group);
+ +
+ +      for (i = 0; i < MAX_NUMNODES; i++) {
+ +              /* Set up node groups */
+ +              struct sched_group *sg, *prev;
+ +              cpumask_t nodemask = node_to_cpumask(i);
+ +              cpumask_t domainspan;
+ +              cpumask_t covered = CPU_MASK_NONE;
+ +              int j;
+ +
+ +              cpus_and(nodemask, nodemask, *cpu_map);
+ +              if (cpus_empty(nodemask)) {
+ +                      sched_group_nodes[i] = NULL;
+ +                      continue;
+ +              }
+ +
+ +              domainspan = sched_domain_node_span(i);
+ +              cpus_and(domainspan, domainspan, *cpu_map);
+ +
+ +              sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ +              sched_group_nodes[i] = sg;
+ +              for_each_cpu_mask(j, nodemask) {
+ +                      struct sched_domain *sd;
+ +                      sd = &per_cpu(node_domains, j);
+ +                      sd->groups = sg;
+ +                      if (sd->groups == NULL) {
+ +                              /* Turn off balancing if we have no groups */
+ +                              sd->flags = 0;
+ +                      }
+ +              }
+ +              if (!sg) {
+ +                      printk(KERN_WARNING
+ +                      "Can not alloc domain group for node %d\n", i);
+ +                      continue;
+ +              }
+ +              sg->cpu_power = 0;
+ +              sg->cpumask = nodemask;
+ +              cpus_or(covered, covered, nodemask);
+ +              prev = sg;
+ +
+ +              for (j = 0; j < MAX_NUMNODES; j++) {
+ +                      cpumask_t tmp, notcovered;
+ +                      int n = (i + j) % MAX_NUMNODES;
+ +
+ +                      cpus_complement(notcovered, covered);
+ +                      cpus_and(tmp, notcovered, *cpu_map);
+ +                      cpus_and(tmp, tmp, domainspan);
+ +                      if (cpus_empty(tmp))
+ +                              break;
+ +
+ +                      nodemask = node_to_cpumask(n);
+ +                      cpus_and(tmp, tmp, nodemask);
+ +                      if (cpus_empty(tmp))
+ +                              continue;
+ +
+ +                      sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ +                      if (!sg) {
+ +                              printk(KERN_WARNING
+ +                              "Can not alloc domain group for node %d\n", j);
+ +                              break;
+ +                      }
+ +                      sg->cpu_power = 0;
+ +                      sg->cpumask = tmp;
+ +                      cpus_or(covered, covered, tmp);
+ +                      prev->next = sg;
+ +                      prev = sg;
+ +              }
+ +              prev->next = sched_group_nodes[i];
+ +      }
   #endif
   
         /* Calculate CPU power for physical packages and nodes */
@@@ -5182,46 -5018,14 +5183,46 @@@
                 sd->groups->cpu_power = power;
   
   #ifdef CONFIG_NUMA
- -              if (i == first_cpu(sd->groups->cpumask)) {
- -                      /* Only add "power" once for each physical package. */
- -                      sd = &per_cpu(node_domains, i);
- -                      sd->groups->cpu_power += power;
+ +              sd = &per_cpu(allnodes_domains, i);
+ +              if (sd->groups) {
+ +                      power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ +                              (cpus_weight(sd->groups->cpumask)-1) / 10;
+ +                      sd->groups->cpu_power = power;
                 }
   #endif
         }
   
+ +#ifdef CONFIG_NUMA
+ +      for (i = 0; i < MAX_NUMNODES; i++) {
+ +              struct sched_group *sg = sched_group_nodes[i];
+ +              int j;
+ +
+ +              if (sg == NULL)
+ +                      continue;
+ +next_sg:
+ +              for_each_cpu_mask(j, sg->cpumask) {
+ +                      struct sched_domain *sd;
+ +                      int power;
+ +
+ +                      sd = &per_cpu(phys_domains, j);
+ +                      if (j != first_cpu(sd->groups->cpumask)) {
+ +                              /*
+ +                               * Only add "power" once for each
+ +                               * physical package.
+ +                               */
+ +                              continue;
+ +                      }
+ +                      power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ +                              (cpus_weight(sd->groups->cpumask)-1) / 10;
+ +
+ +                      sg->cpu_power += power;
+ +              }
+ +              sg = sg->next;
+ +              if (sg != sched_group_nodes[i])
+ +                      goto next_sg;
+ +      }
+ +#endif
+ +
         /* Attach the domains */
         for_each_cpu_mask(i, *cpu_map) {
                 struct sched_domain *sd;
@@@ -5236,10 -5040,13 +5237,10 @@@
   /*
    * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
    */
- -static void arch_init_sched_domains(cpumask_t *cpu_map)
+ +static void arch_init_sched_domains(const cpumask_t *cpu_map)
   {
         cpumask_t cpu_default_map;
   
- -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- -      check_sibling_maps();
- -#endif
         /*
          * Setup mask for cpus without special case scheduling requirements.
          * For now this just excludes isolated cpus, but could be used to
@@@ -5252,47 -5059,10 +5253,47 @@@
   
  static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
  {
- -      /* Do nothing: everything is statically allocated. */
- -}
+ +#ifdef CONFIG_NUMA
+ +      int i;
+ +      int cpu;
+ +
+ +      for_each_cpu_mask(cpu, *cpu_map) {
+ +              struct sched_group *sched_group_allnodes
+ +                      = sched_group_allnodes_bycpu[cpu];
+ +              struct sched_group **sched_group_nodes
+ +                      = sched_group_nodes_bycpu[cpu];
+ +
+ +              if (sched_group_allnodes) {
+ +                      kfree(sched_group_allnodes);
+ +                      sched_group_allnodes_bycpu[cpu] = NULL;
+ +              }
+ +
+ +              if (!sched_group_nodes)
+ +                      continue;  /* no per-node list recorded for this cpu */
+ +
+ +              for (i = 0; i < MAX_NUMNODES; i++) {
+ +                      cpumask_t nodemask = node_to_cpumask(i);
+ +                      struct sched_group *oldsg, *sg = sched_group_nodes[i];
  
- -#endif /* ARCH_HAS_SCHED_DOMAIN */
+ +                      cpus_and(nodemask, nodemask, *cpu_map);
+ +                      if (cpus_empty(nodemask))
+ +                              continue;
+ +
+ +                      if (sg == NULL)
+ +                              continue;
+ +                      sg = sg->next;  /* start past the head; head is freed last */
+ +next_sg:
+ +                      oldsg = sg;
+ +                      sg = sg->next;  /* read the link before freeing oldsg */
+ +                      kfree(oldsg);
+ +                      if (oldsg != sched_group_nodes[i])  /* stop once the head is freed */
+ +                              goto next_sg;
+ +              }
+ +              kfree(sched_group_nodes);
+ +              sched_group_nodes_bycpu[cpu] = NULL;
+ +      }
+ +#endif
+ +}
   
   /*
    * Detach sched domains from a group of cpus specified in cpu_map
author	Linus Torvalds <torvalds@g5.osdl.org>
	Thu, 8 Sep 2005 00:28:25 +0000 (17:28 -0700)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Thu, 8 Sep 2005 00:28:25 +0000 (17:28 -0700)