__enqueue_entity(cfs_rq, se);
}
-static void update_avg(u64 *avg, u64 sample)
-{
- s64 diff = sample - *avg;
- *avg += diff >> 3;
-}
-
-static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
- if (!se->last_wakeup)
- return;
-
- update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
- se->last_wakeup = 0;
-}
-
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
update_stats_dequeue(cfs_rq, se);
if (sleep) {
- update_avg_stats(cfs_rq, se);
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
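For reference, the update_avg() helper removed above keeps se->avg_overlap as an exponentially weighted moving average, stepping 1/8 of the way toward each new sample; the wake_affine() hunk further down still consumes that average. Below is a minimal userspace sketch of the same averaging rule, not kernel code; the sample values are purely illustrative.

#include <stdio.h>
#include <stdint.h>

/*
 * Same rule as the removed update_avg(): move the running average 1/8
 * of the way toward each new sample. Like the kernel code, this relies
 * on arithmetic right shift for negative deltas.
 */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg = 0;
	uint64_t samples[] = { 800, 800, 800, 100, 100, 100 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg, samples[i]);
		printf("sample=%4llu avg=%4llu\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg);
	}
	return 0;
}

The average reacts quickly to a burst of similar samples but never jumps to the latest one, which is what makes it usable as a cheap "how much do these two tasks overlap" estimate.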
#ifdef CONFIG_SCHED_HRTICK
static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
- int requeue = rq->curr == p;
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
* Don't schedule slices shorter than 10000ns, that just
* doesn't make sense. Rely on vruntime for fairness.
*/
- if (!requeue)
- delta = max(10000LL, delta);
+ if (rq->curr != p)
+ delta = max_t(s64, 10000LL, delta);
- hrtick_start(rq, delta, requeue);
+ hrtick_start(rq, delta);
}
}
-#else
+#else /* !CONFIG_SCHED_HRTICK */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
+ * Domains may include CPUs that are not usable for migration,
+ * hence we need to mask them out (cpu_active_map)
*
* Returns the CPU we should wake onto.
*/
|| ((sd->flags & SD_WAKE_IDLE_FAR)
&& !task_hot(p, task_rq(p)->clock, sd))) {
cpus_and(tmp, sd->span, p->cpus_allowed);
- for_each_cpu_mask(i, tmp) {
+ cpus_and(tmp, tmp, cpu_active_map);
+ for_each_cpu_mask_nr(i, tmp) {
if (idle_cpu(i)) {
if (i != task_cpu(p)) {
schedstat_inc(p,
}
return cpu;
}
-#else
+#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
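The hunk above narrows the idle-CPU search to CPUs that are in the domain span, allowed for the task, and currently active. Here is a minimal sketch of that masking idea using a plain 32-bit word instead of the kernel's cpumask API; the function name, NR_CPUS value, and mask layout are hypothetical.

#include <stdint.h>

#define NR_CPUS 8

/*
 * Sketch: pick an idle CPU from the intersection of the CPUs the task
 * may run on, the CPUs in the scheduling domain, and the CPUs that are
 * active (usable for migration). Returns -1 if there is none.
 */
static int pick_idle_cpu(uint32_t domain_span, uint32_t cpus_allowed,
			 uint32_t cpu_active, const int idle[NR_CPUS])
{
	uint32_t candidates = domain_span & cpus_allowed & cpu_active;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if ((candidates & (1u << cpu)) && idle[cpu])
			return cpu;
	}
	return -1;	/* caller falls back to the originally chosen CPU */
}

The kernel's real cpumask helpers operate on arbitrarily sized masks; a single 32-bit word is only enough for this illustration.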
static const struct sched_class fair_sched_class;
#ifdef CONFIG_FAIR_GROUP_SCHED
-static unsigned long effective_load(struct task_group *tg, int cpu,
- unsigned long wl, unsigned long wg)
+/*
+ * effective_load() calculates the load change as seen from the root_task_group
+ *
+ * Adding load to a group doesn't make a group heavier, but can cause movement
+ * of group shares between cpus. Assuming the shares were perfectly aligned one
+ * can calculate the shift in shares.
+ *
+ * The problem is that perfectly aligning the shares is rather expensive, hence
+ * we try to avoid doing that too often - see update_shares(), which ratelimits
+ * this change.
+ *
+ * We compensate for this by not only taking the current delta into account,
+ * but also the delta between when the shares were last adjusted and now.
+ *
+ * We still saw a performance dip; tracing showed us that when balancing
+ * between cgroup:/ and cgroup:/foo the number of affine wakeups increased
+ * significantly. Therefore we try to bias the error in the direction of
+ * failing the affine wakeup.
+ */
+static long effective_load(struct task_group *tg, int cpu,
+ long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
+ long more_w;
+
+ if (!tg->parent)
+ return wl;
+
+ /*
+ * By not taking the decrease of shares on the other cpu into
+ * account our error leans towards reducing the affine wakeups.
+ */
+ if (!wl && sched_feat(ASYM_EFF_LOAD))
+ return wl;
+
+ /*
+ * Instead of using this increment, also add the difference
+ * between when the shares were last updated and now.
+ */
+ more_w = se->my_q->load.weight - se->my_q->rq_weight;
+ wl += more_w;
+ wg += more_w;
for_each_sched_entity(se) {
#define D(n) (likely(n) ? (n) : 1)
S = se->my_q->tg->shares;
s = se->my_q->shares;
- rw = se->my_q->load.weight;
+ rw = se->my_q->rq_weight;
a = S*(rw + wl);
b = S*rw + s*wg;
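To make the comment above concrete, here is a small standalone sketch of the underlying idea rather than the kernel's fixed-point loop: a group owns S shares, split across CPUs in proportion to per-CPU runqueue weight, and adding weight on one CPU shifts some of those shares toward it. The helper name and all numbers are illustrative.

#include <stdio.h>

/*
 * Sketch: this CPU's slice of the group's S shares, given its local
 * runqueue weight 'rw' and the group weight summed over all CPUs.
 */
static long cpu_share(long S, long rw, long total)
{
	return total ? S * rw / total : S;
}

int main(void)
{
	long S = 1024;		/* group's total shares */
	long rw = 2048;		/* this CPU's runqueue weight for the group */
	long total = 4096;	/* group weight summed over all CPUs */
	long wl = 1024;		/* weight about to be added on this CPU */

	long before = cpu_share(S, rw, total);
	long after  = cpu_share(S, rw + wl, total + wl);

	/*
	 * The group does not get heavier (it still owns S shares), but
	 * this CPU's slice of them grows; effective_load() estimates
	 * this kind of shift, level by level up the group hierarchy.
	 */
	printf("share before=%ld after=%ld shift=%ld\n",
	       before, after, after - before);
	return 0;
}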
* a reasonable amount of time then attract this newly
* woken task:
*/
- if (sync && balanced && curr->sched_class == &fair_sched_class) {
+ if (sync && balanced) {
if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
- p->se.avg_overlap < sysctl_sched_migration_cost)
+ p->se.avg_overlap < sysctl_sched_migration_cost)
return 1;
}
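The condition above pulls the wakee onto the waker's CPU only for a synchronous wakeup between a balanced pair of runqueues whose tasks historically overlap very little. A standalone sketch of that decision follows; the function name and the 0.5 ms figure (the usual default of sysctl_sched_migration_cost) are assumptions, not taken from this patch.

#include <stdbool.h>
#include <stdint.h>

/* Assumed default of sysctl_sched_migration_cost: 0.5 ms in nanoseconds. */
#define MIGRATION_COST_NS	500000ULL

/*
 * Sketch of the affine-wakeup test: a synchronous wakeup between two
 * tasks that rarely run concurrently (low avg_overlap) is cheaper to
 * serve on the waker's CPU than via a cross-CPU wakeup.
 */
static bool wake_affine_ok(bool sync, bool balanced,
			   uint64_t curr_avg_overlap, uint64_t wakee_avg_overlap)
{
	if (!sync || !balanced)
		return false;

	return curr_avg_overlap < MIGRATION_COST_NS &&
	       wakee_avg_overlap < MIGRATION_COST_NS;
}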
return;
}
- se->last_wakeup = se->sum_exec_runtime;
if (unlikely(se == pse))
return;
struct task_struct *p = NULL;
struct sched_entity *se;
- while (next != &cfs_rq->tasks) {
+ if (next == &cfs_rq->tasks)
+ return NULL;
+
+ /* Skip over entities that are not tasks */
+ do {
se = list_entry(next, struct sched_entity, group_node);
next = next->next;
+ } while (next != &cfs_rq->tasks && !entity_is_task(se));
- /* Skip over entities that are not tasks */
- if (entity_is_task(se)) {
- p = task_of(se);
- break;
- }
- }
+ if (next == &cfs_rq->tasks)
+ return NULL;
cfs_rq->balance_iterator = next;
+
+ if (entity_is_task(se))
+ p = task_of(se);
+
return p;
}
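The reworked iterator above walks cfs_rq->tasks and skips group entities that are not tasks. A minimal userspace sketch of the same walk-and-skip pattern over a sentinel-terminated list is shown below; the entity type, field names, and next_task() helper are illustrative, not the kernel's list API.

#include <stddef.h>
#include <stdio.h>

/* Illustrative list element, flagged if it represents a task. */
struct entity {
	struct entity *next;	/* next element; list ends at a sentinel head */
	int is_task;
	const char *name;
};

/*
 * Return the first task at or after '*iter' and advance the iterator
 * past it, skipping non-task entities; NULL once the sentinel 'head'
 * is reached.
 */
static struct entity *next_task(struct entity *head, struct entity **iter)
{
	struct entity *pos = *iter;

	while (pos != head && !pos->is_task)
		pos = pos->next;

	if (pos == head)
		return NULL;

	*iter = pos->next;
	return pos;
}

int main(void)
{
	struct entity head = { .next = NULL };
	struct entity c = { .next = &head, .is_task = 1, .name = "task-C" };
	struct entity b = { .next = &c,    .is_task = 0, .name = "group-B" };
	struct entity a = { .next = &b,    .is_task = 1, .name = "task-A" };
	struct entity *iter, *e;

	head.next = &a;
	iter = head.next;

	while ((e = next_task(&head, &iter)))
		printf("%s\n", e->name);	/* prints task-A, then task-C */

	return 0;
}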
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
/*
* scheduler tick hitting a task of our scheduling class: