]> err.no Git - linux-2.6/blobdiff - kernel/sched.c
Merge branch 'master' of ssh://rsync.linux-nfs.org/home/trondmy/www_sites/rsync.linux...
[linux-2.6] / kernel / sched.c
index 6da13bba3e23c18add93ec275e904106dfdda2af..1f31a528fdba8ef941f5a75acb65c3ac6a4bf815 100644 (file)
@@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
 
 static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
 {
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = current;
+#endif
        spin_unlock_irq(&rq->lock);
 }
 
@@ -1529,10 +1533,6 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
         *              Manfred Spraul <manfred@colorfullife.com>
         */
        prev_task_flags = prev->flags;
-#ifdef CONFIG_DEBUG_SPINLOCK
-       /* this is a valid case when another task releases the spinlock */
-       rq->lock.owner = current;
-#endif
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
        if (mm)
@@ -1906,10 +1906,11 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum idle_type idle)
+                  unsigned long *imbalance, enum idle_type idle, int *sd_idle)
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       unsigned long max_pull;
        int load_idx;
 
        max_load = this_load = total_load = total_pwr = 0;
@@ -1931,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                avg_load = 0;
 
                for_each_cpu_mask(i, group->cpumask) {
+                       if (*sd_idle && !idle_cpu(i))
+                               *sd_idle = 0;
+
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
                                load = target_load(i, load_idx);
@@ -1956,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                group = group->next;
        } while (group != sd->groups);
 
-       if (!busiest || this_load >= max_load)
+       if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
                goto out_balanced;
 
        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1976,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
         * by pulling tasks to us.  Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
+
+       /* Don't want to pull so many tasks that a group would go idle */
+       max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
        /* How much load to actually move to equalise the imbalance */
-       *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+       *imbalance = min(max_pull * busiest->cpu_power,
                                (avg_load - this_load) * this->cpu_power)
                        / SCHED_LOAD_SCALE;
 
@@ -2074,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        unsigned long imbalance;
        int nr_moved, all_pinned = 0;
        int active_balance = 0;
+       int sd_idle = 0;
+
+       if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+               sd_idle = 1;
 
-       spin_lock(&this_rq->lock);
        schedstat_inc(sd, lb_cnt[idle]);
 
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[idle]);
                goto out_balanced;
@@ -2102,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 * still unbalanced. nr_moved simply stays zero, so it is
                 * correctly treated as an imbalance.
                 */
-               double_lock_balance(this_rq, busiest);
+               double_rq_lock(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                               imbalance, sd, idle,
-                                               &all_pinned);
-               spin_unlock(&busiest->lock);
+                                       imbalance, sd, idle, &all_pinned);
+               double_rq_unlock(this_rq, busiest);
 
                /* All tasks on this runqueue were pinned by CPU affinity */
                if (unlikely(all_pinned))
                        goto out_balanced;
        }
 
-       spin_unlock(&this_rq->lock);
-
        if (!nr_moved) {
                schedstat_inc(sd, lb_failed[idle]);
                sd->nr_balance_failed++;
@@ -2122,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
 
                        spin_lock(&busiest->lock);
+
+                       /* don't kick the migration_thread, if the curr
+                        * task on busiest cpu can't be moved to this_cpu
+                        */
+                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                               spin_unlock(&busiest->lock);
+                               all_pinned = 1;
+                               goto out_one_pinned;
+                       }
+
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
@@ -2154,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                        sd->balance_interval *= 2;
        }
 
+       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               return -1;
        return nr_moved;
 
 out_balanced:
-       spin_unlock(&this_rq->lock);
-
        schedstat_inc(sd, lb_balanced[idle]);
 
        sd->nr_balance_failed = 0;
+
+out_one_pinned:
        /* tune up the balancing interval */
        if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
                        (sd->balance_interval < sd->max_interval))
                sd->balance_interval *= 2;
 
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               return -1;
        return 0;
 }
 
@@ -2184,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        runqueue_t *busiest = NULL;
        unsigned long imbalance;
        int nr_moved = 0;
+       int sd_idle = 0;
+
+       if (sd->flags & SD_SHARE_CPUPOWER)
+               sd_idle = 1;
 
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
        if (!group) {
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                goto out_balanced;
@@ -2200,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 
        BUG_ON(busiest == this_rq);
 
-       /* Attempt to move tasks */
-       double_lock_balance(this_rq, busiest);
-
        schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
-       nr_moved = move_tasks(this_rq, this_cpu, busiest,
+
+       nr_moved = 0;
+       if (busiest->nr_running > 1) {
+               /* Attempt to move tasks */
+               double_lock_balance(this_rq, busiest);
+               nr_moved = move_tasks(this_rq, this_cpu, busiest,
                                        imbalance, sd, NEWLY_IDLE, NULL);
-       if (!nr_moved)
+               spin_unlock(&busiest->lock);
+       }
+
+       if (!nr_moved) {
                schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-       else
+               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+                       return -1;
+       } else
                sd->nr_balance_failed = 0;
 
-       spin_unlock(&busiest->lock);
        return nr_moved;
 
 out_balanced:
        schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               return -1;
        sd->nr_balance_failed = 0;
        return 0;
 }
@@ -2340,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
 
                if (j - sd->last_balance >= interval) {
                        if (load_balance(this_cpu, this_rq, sd, idle)) {
-                               /* We've pulled tasks over so no longer idle */
+                               /*
+                                * We've pulled tasks over so either we're no
+                                * longer idle, or one of our SMT siblings is
+                                * not idle.
+                                */
                                idle = NOT_IDLE;
                        }
                        sd->last_balance += interval;
@@ -2650,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
         */
 }
 
+/*
+ * number of 'lost' timeslices this task won't be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+       return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
 static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
        struct sched_domain *tmp, *sd = NULL;
@@ -2714,8 +2762,9 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                                (sd->per_cpu_gain * DEF_TIMESLICE / 100))
                                        ret = 1;
                } else
-                       if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) /
-                               100) > task_timeslice(p)))
+                       if (smt_curr->static_prio < p->static_prio &&
+                               !TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(smt_curr, sd) > task_timeslice(p))
                                        ret = 1;
 
 check_smt_task:
@@ -2737,8 +2786,8 @@ check_smt_task:
                                (sd->per_cpu_gain * DEF_TIMESLICE / 100))
                                        resched_task(smt_curr);
                } else {
-                       if ((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
-                               task_timeslice(smt_curr))
+                       if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(p, sd) > task_timeslice(smt_curr))
                                        resched_task(smt_curr);
                        else
                                wakeup_busy_runqueue(smt_rq);
@@ -3907,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
        if (rt_task(current))
                target = rq->active;
 
-       if (current->array->nr_active == 1) {
+       if (array->nr_active == 1) {
                schedstat_inc(rq, yld_act_empty);
                if (!rq->expired->nr_active)
                        schedstat_inc(rq, yld_both_empty);
@@ -5553,3 +5602,47 @@ void normalize_rt_tasks(void)
 }
 
 #endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+       return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPUs synchronized and interrupts disabled, and the
+ * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+       cpu_curr(cpu) = p;
+}
+
+#endif