x86: Convert cpu_core_map to be a per cpu variable

[linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 791dd08c692f2eadb44df1ffc6bbd8e4978e07d2..bba57adb95044ba215caa58ee7bdd514ccd71c1c 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1005,6 +1005,28 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  
  #ifdef CONFIG_SMP
  
+/*
+ * Is this task likely cache-hot?  Only fair-class tasks can be.  A
+ * sysctl_sched_migration_cost of 0 means "never hot" and -1 means
+ * "always hot"; otherwise @p is hot while less than that much time has
+ * passed between se.exec_start and @now.  @sd is not examined here.
+ */
+static inline int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+       s64 since_exec;
+
+       if (p->sched_class != &fair_sched_class ||
+           sysctl_sched_migration_cost == 0)
+               return 0;
+       if (sysctl_sched_migration_cost == -1)
+               return 1;
+
+       since_exec = now - p->se.exec_start;
+       return since_exec < (s64)sysctl_sched_migration_cost;
+}
+
+
  void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  {
         int old_cpu = task_cpu(p);
@@ -1022,6 +1044,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+       if (old_cpu != new_cpu) {
+               schedstat_inc(p, se.nr_migrations);
+               if (task_hot(p, old_rq->clock, NULL))
+                       schedstat_inc(p, se.nr_forced2_migrations);
+       }
  #endif
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
@@ -1394,8 +1421,13 @@ static int wake_idle(int cpu, struct task_struct *p)
                 if (sd->flags & SD_WAKE_IDLE) {
                         cpus_and(tmp, sd->span, p->cpus_allowed);
                         for_each_cpu_mask(i, tmp) {
-                               if (idle_cpu(i))
+                               if (idle_cpu(i)) {
+                                       if (i != task_cpu(p)) {
+                                               schedstat_inc(p,
+                                                       se.nr_wakeups_idle);
+                                       }
                                         return i;
+                               }
                         }
                 } else {
                         break;
@@ -1426,7 +1458,7 @@ static inline int wake_idle(int cpu, struct task_struct *p)
   */
  static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
  {
-       int cpu, this_cpu, success = 0;
+       int cpu, orig_cpu, this_cpu, success = 0;
         unsigned long flags;
         long old_state;
         struct rq *rq;
@@ -1445,6 +1477,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                 goto out_running;
  
         cpu = task_cpu(p);
+       orig_cpu = cpu;
         this_cpu = smp_processor_id();
  
  #ifdef CONFIG_SMP
@@ -1488,6 +1521,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                         unsigned long tl = this_load;
                         unsigned long tl_per_task;
  
+                       /*
+                        * Attract cache-cold tasks on sync wakeups:
+                        */
+                       if (sync && !task_hot(p, rq->clock, this_sd))
+                               goto out_set_cpu;
+
+                       schedstat_inc(p, se.nr_wakeups_affine_attempts);
                         tl_per_task = cpu_avg_load_per_task(this_cpu);
  
                         /*
@@ -1507,6 +1547,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                                  * there is no bad imbalance.
                                  */
                                 schedstat_inc(this_sd, ttwu_move_affine);
+                               schedstat_inc(p, se.nr_wakeups_affine);
                                 goto out_set_cpu;
                         }
                 }
@@ -1518,6 +1559,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                 if (this_sd->flags & SD_WAKE_BALANCE) {
                         if (imbalance*this_load <= 100*load) {
                                 schedstat_inc(this_sd, ttwu_move_balance);
+                               schedstat_inc(p, se.nr_wakeups_passive);
                                 goto out_set_cpu;
                         }
                 }
@@ -1543,18 +1585,18 @@ out_set_cpu:
  
  out_activate:
  #endif /* CONFIG_SMP */
+       schedstat_inc(p, se.nr_wakeups);
+       if (sync)
+               schedstat_inc(p, se.nr_wakeups_sync);
+       if (orig_cpu != cpu)
+               schedstat_inc(p, se.nr_wakeups_migrate);
+       if (cpu == this_cpu)
+               schedstat_inc(p, se.nr_wakeups_local);
+       else
+               schedstat_inc(p, se.nr_wakeups_remote);
         update_rq_clock(rq);
         activate_task(rq, p, 1);
-       /*
-        * Sync wakeups (i.e. those types of wakeups where the waker
-        * has indicated that it will leave the CPU in short order)
-        * don't trigger a preemption, if the woken up task will run on
-        * this cpu. (in this case the 'I will reschedule' promise of
-        * the waker guarantees that the freshly woken up task is going
-        * to be considered on this CPU.)
-        */
-       if (!sync || cpu != this_cpu)
-               check_preempt_curr(rq, p);
+       check_preempt_curr(rq, p);
         success = 1;
  
  out_running:
@@ -2132,13 +2174,38 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
          * 3) are cache-hot on their current CPU.
          */
-       if (!cpu_isset(this_cpu, p->cpus_allowed))
+       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+               schedstat_inc(p, se.nr_failed_migrations_affine);
                 return 0;
+       }
         *all_pinned = 0;
  
-       if (task_running(rq, p))
+       if (task_running(rq, p)) {
+               schedstat_inc(p, se.nr_failed_migrations_running);
                 return 0;
+       }
+
+       /*
+        * Aggressive migration if:
+        * 1) task is cache cold, or
+        * 2) too many balance attempts have failed.
+        */
  
+       if (!task_hot(p, rq->clock, sd) ||
+                       sd->nr_balance_failed > sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+               if (task_hot(p, rq->clock, sd)) {
+                       schedstat_inc(sd, lb_hot_gained[idle]);
+                       schedstat_inc(p, se.nr_forced_migrations);
+               }
+#endif
+               return 1;
+       }
+
+       if (task_hot(p, rq->clock, sd)) {
+               schedstat_inc(p, se.nr_failed_migrations_hot);
+               return 0;
+       }
         return 1;
  }
  
@@ -3232,6 +3299,25 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
                 cpustat->user = cputime64_add(cpustat->user, tmp);
  }
  
+/*
+ * Charge time spent running a virtual machine guest to a task.
+ * @p: the task charged; its utime and gtime both grow by @cputime
+ * @cputime: the guest cpu time accumulated since the last update
+ * The user and guest fields of kstat_this_cpu.cpustat grow as well.
+ */
+void account_guest_time(struct task_struct *p, cputime_t cputime)
+{
+       struct cpu_usage_stat *stats = &kstat_this_cpu.cpustat;
+       cputime64_t delta64 = cputime_to_cputime64(cputime);
+
+       /* Guest time is booked as user time too. */
+       p->gtime = cputime_add(p->gtime, cputime);
+       p->utime = cputime_add(p->utime, cputime);
+
+       stats->guest = cputime64_add(stats->guest, delta64);
+       stats->user = cputime64_add(stats->user, delta64);
+}
+
  /*
   * Account system cpu time to a process.
   * @p: the process that the cpu time gets accounted to
@@ -3245,6 +3331,12 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         struct rq *rq = this_rq();
         cputime64_t tmp;
  
+       if (p->flags & PF_VCPU) {
+               account_guest_time(p, cputime);
+               p->flags &= ~PF_VCPU;
+               return;
+       }
+
         p->stime = cputime_add(p->stime, cputime);
  
         /* Add system time to cpustat. */
@@ -5173,14 +5265,42 @@ static struct ctl_table sd_ctl_root[] = {
  static struct ctl_table *sd_alloc_ctl_entry(int n)
  {
         struct ctl_table *entry =
-               kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL);
-
-       BUG_ON(!entry);
-       memset(entry, 0, n * sizeof(struct ctl_table));
+               kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
  
         return entry;
  }
  
+/*
+ * Free the sysctl table *tablep and, recursively, every child table
+ * reachable from it, then reset *tablep to NULL.  *tablep must not be
+ * NULL on entry.
+ *
+ * The walk ends at the first entry whose mode is zero (tables come
+ * zero-filled from sd_alloc_ctl_entry(), so the terminator qualifies).
+ * Keying on mode rather than procname keeps the walk going past a
+ * directory entry whose kstrdup()ed name came back NULL.
+ *
+ * Directory entries, such as the cpuN ones, carry no proc_handler and
+ * own a kstrdup()ed procname, which is released here.
+ * NOTE(review): assumes every entry filled in by set_table_entry() has a
+ * non-zero mode and a proc_handler, and that its procname is a string
+ * literal - confirm against set_table_entry() and its callers.
+ */
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+       struct ctl_table *entry;
+
+       for (entry = *tablep; entry->mode; entry++) {
+               if (entry->child)
+                       sd_free_ctl_entry(&entry->child);
+               if (entry->proc_handler == NULL)
+                       kfree(entry->procname);
+       }
+
+       kfree(*tablep);
+       *tablep = NULL;
+}
+
  static void
  set_table_entry(struct ctl_table *entry,
                 const char *procname, void *data, int maxlen,
@@ -5198,6 +5299,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
  {
         struct ctl_table *table = sd_alloc_ctl_entry(12);
  
+       if (table == NULL)
+               return NULL;
+
         set_table_entry(&table[0], "min_interval", &sd->min_interval,
                 sizeof(long), 0644, proc_doulongvec_minmax);
         set_table_entry(&table[1], "max_interval", &sd->max_interval,
@@ -5221,6 +5325,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                 sizeof(int), 0644, proc_dointvec_minmax);
         set_table_entry(&table[10], "flags", &sd->flags,
                 sizeof(int), 0644, proc_dointvec_minmax);
+       /* &table[11] is terminator */
  
         return table;
  }
@@ -5235,6 +5340,8 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
         for_each_domain(cpu, sd)
                 domain_num++;
         entry = table = sd_alloc_ctl_entry(domain_num + 1);
+       if (table == NULL)
+               return NULL;
  
         i = 0;
         for_each_domain(cpu, sd) {
@@ -5249,24 +5356,47 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
  }
  
  static struct ctl_table_header *sd_sysctl_header;
-static void init_sched_domain_sysctl(void)
+static void register_sched_domain_sysctl(void)
  {
         int i, cpu_num = num_online_cpus();
         struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
         char buf[32];
  
+       if (entry == NULL)
+               return;
+
         sd_ctl_dir[0].child = entry;
  
-       for (i = 0; i < cpu_num; i++, entry++) {
+       for_each_online_cpu(i) {
                 snprintf(buf, 32, "cpu%d", i);
                 entry->procname = kstrdup(buf, GFP_KERNEL);
                 entry->mode = 0555;
                 entry->child = sd_alloc_ctl_cpu_table(i);
+               entry++;
         }
         sd_sysctl_header = register_sysctl_table(sd_ctl_root);
  }
+
+/*
+ * Tear down the sched_domain sysctl tree and free its tables.
+ *
+ * register_sched_domain_sysctl() returns early, with nothing
+ * registered and no child table attached, when its table allocation
+ * fails, so each step is skipped when there is nothing to undo.
+ */
+static void unregister_sched_domain_sysctl(void)
+{
+       if (sd_sysctl_header)
+               unregister_sysctl_table(sd_sysctl_header);
+       sd_sysctl_header = NULL;
+       if (sd_ctl_dir[0].child)
+               sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
  #else
-static void init_sched_domain_sysctl(void)
+static void register_sched_domain_sysctl(void)
+{
+}
+static void unregister_sched_domain_sysctl(void)
  {
  }
  #endif
@@ -5948,7 +6069,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
         /*
          * Allocate the per-node list of sched groups
          */
-       sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
+       sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
                                            GFP_KERNEL);
         if (!sched_group_nodes) {
                 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6201,6 +6322,8 @@ static int arch_init_sched_domains(const cpumask_t *cpu_map)
  
         err = build_sched_domains(&cpu_default_map);
  
+       register_sched_domain_sysctl();
+
         return err;
  }
  
@@ -6217,6 +6340,8 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
  {
         int i;
  
+       unregister_sched_domain_sysctl();
+
         for_each_cpu_mask(i, *cpu_map)
                 cpu_attach_domain(NULL, i);
         synchronize_sched();
@@ -6247,6 +6372,8 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
         if (!err && !cpus_empty(*partition2))
                 err = build_sched_domains(partition2);
  
+       register_sched_domain_sysctl();
+
         return err;
  }
  
@@ -6378,8 +6505,6 @@ void __init sched_init_smp(void)
         /* XXX: Theoretical race here - CPU may be hotplugged now */
         hotcpu_notifier(update_sched_domains, 0);
  
-       init_sched_domain_sysctl();
-
         /* Move init over to a non-isolated CPU */
         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                 BUG();