* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
+ * 2007-04-15 Work begun on replacing all interactivity tuning with a
+ * fair scheduling design by Con Kolivas.
+ * 2007-05-05 Load balancing (smp-nice) and other improvements
+ * by Peter Williams
+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
+#include <linux/unistd.h>
#include <asm/tlb.h>
-#include <asm/unistd.h>
/*
* Scheduler clock - returns current time in nanosec units.
*/
#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT 30
-#define CHILD_PENALTY 95
-#define PARENT_PENALTY 100
-#define EXIT_WEIGHT 3
-#define PRIO_BONUS_RATIO 25
-#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA 2
-#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT (MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- * priority range a task can explore, a value of '1' means the
- * task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
- (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
- MAX_SLEEP_AVG)
-
-#define GRANULARITY (10 * HZ / 1000 ? : 1)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
- num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
- (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
- (v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
- (SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
- INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
- ((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
- (JIFFIES_TO_NS(MAX_SLEEP_AVG * \
- (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define TASK_PREEMPTS_CURR(p, rq) \
- ((p)->prio < (rq)->curr->prio)
-
-#define SCALE_PRIO(x, prio) \
- max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-static unsigned int static_prio_timeslice(int static_prio)
-{
- if (static_prio < NICE_TO_PRIO(0))
- return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
- else
- return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
#ifdef CONFIG_SMP
/*
}
#endif
+#define SCALE_PRIO(x, prio) \
+ max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
+
/*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
+ * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
*/
-
-static inline unsigned int task_timeslice(struct task_struct *p)
+static unsigned int static_prio_timeslice(int static_prio)
{
- return static_prio_timeslice(p->static_prio);
+ if (static_prio == NICE_TO_PRIO(19))
+ return 1;
+
+ if (static_prio < NICE_TO_PRIO(0))
+ return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
+ else
+ return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
static inline int rt_policy(int policy)
struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};
-/*
- * The prio-array type of the old scheduler:
- */
-struct prio_array {
- unsigned int nr_active;
- DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
- struct list_head queue[MAX_PRIO];
-};
-
/*
* This is the main, per-CPU runqueue data structure.
*
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
- unsigned long raw_weighted_load;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
unsigned char idle_at_tick;
*/
unsigned long nr_uninterruptible;
- unsigned long expired_timestamp;
- unsigned long long most_recent_timestamp;
-
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
- struct prio_array *active, *expired, arrays[2];
- int best_expired_prio;
-
u64 clock, prev_clock_raw;
s64 clock_max_delta;
struct lock_class_key rq_lock_key;
};
-static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+/*
+ * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
+ * clock constructed from sched_clock():
+ */
+unsigned long long cpu_clock(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long long now;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ now = rq_clock(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ return now;
+}
+
#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
*
* The "10% effect" is relative and cumulative: from _any_ nice level,
* if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage.
+ * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
+ * If a task goes up by ~10% and another task goes down by ~10% then
+ * the relative distance between them is ~25%.)
*/
static const int prio_to_weight[40] = {
/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
};
+/*
+ * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
+ *
+ * In cases where the weight does not change often, we can use the
+ * precalculated inverse to speed up arithmetics by turning divisions
+ * into multiplications:
+ */
static const u32 prio_to_wmult[40] = {
- 48356, 60446, 75558, 94446, 118058, 147573,
- 184467, 230589, 288233, 360285, 450347,
- 562979, 703746, 879575, 1099582, 1374389,
- 717986, 2147483, 2684354, 3355443, 4194304,
- 244160, 6557201, 8196502, 10250518, 12782640,
- 16025997, 19976592, 24970740, 31350126, 39045157,
- 49367440, 61356675, 76695844, 95443717, 119304647,
- 148102320, 186737708, 238609294, 286331153,
+/* -20 */ 48356, 60446, 75558, 94446, 118058,
+/* -15 */ 147573, 184467, 230589, 288233, 360285,
+/* -10 */ 450347, 562979, 703746, 879575, 1099582,
+/* -5 */ 1374389, 1717986, 2147483, 2684354, 3355443,
+/* 0 */ 4194304, 5244160, 6557201, 8196502, 10250518,
+/* 5 */ 12782640, 16025997, 19976592, 24970740, 31350126,
+/* 10 */ 39045157, 49367440, 61356675, 76695844, 95443717,
+/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
static inline void
struct sched_domain *tmp, *sd = NULL;
for_each_domain(cpu, tmp) {
- /*
- * If power savings logic is enabled for a domain, stop there.
- */
+ /*
+ * If power savings logic is enabled for a domain, stop there.
+ */
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
break;
if (tmp->flags & flag)
if (idle_cpu(i))
return i;
}
- }
- else
+ } else {
break;
+ }
}
return cpu;
}
/*
* Remove function-return probe instances associated with this
* task and put them back on the free list.
- */
+ */
kprobe_flush_task(prev);
put_task_struct(prev);
}
rq = cpu_rq(i);
- if (*sd_idle && !idle_cpu(i))
+ if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
- * domains.
+ * domains. In the newly idle case, we will allow all the cpu's
+ * to do the newly idle load balance.
*/
- if (local_group && balance_cpu != this_cpu && balance) {
+ if (idle != CPU_NEWLY_IDLE && local_group &&
+ balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
unsigned long imbalance;
int nr_moved = 0;
int sd_idle = 0;
+ int all_pinned = 0;
cpumask_t cpus = CPU_MASK_ALL;
/*
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
minus_1_or_zero(busiest->nr_running),
- imbalance, sd, CPU_NEWLY_IDLE, NULL);
+ imbalance, sd, CPU_NEWLY_IDLE,
+ &all_pinned);
spin_unlock(&busiest->lock);
- if (!nr_moved) {
+ if (unlikely(all_pinned)) {
cpu_clear(cpu_of(busiest), cpus);
if (!cpus_empty(cpus))
goto redo;
}
EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
-
-#define SLEEP_ON_VAR \
- unsigned long flags; \
- wait_queue_t wait; \
- init_waitqueue_entry(&wait, current);
-
-#define SLEEP_ON_HEAD \
- spin_lock_irqsave(&q->lock,flags); \
- __add_wait_queue(q, &wait); \
+static inline void
+sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
+{
+ spin_lock_irqsave(&q->lock, *flags);
+ __add_wait_queue(q, wait);
spin_unlock(&q->lock);
+}
-#define SLEEP_ON_TAIL \
- spin_lock_irq(&q->lock); \
- __remove_wait_queue(q, &wait); \
- spin_unlock_irqrestore(&q->lock, flags);
+static inline void
+sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
+{
+ spin_lock_irq(&q->lock);
+ __remove_wait_queue(q, wait);
+ spin_unlock_irqrestore(&q->lock, *flags);
+}
-void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
{
- SLEEP_ON_VAR
+ unsigned long flags;
+ wait_queue_t wait;
+
+ init_waitqueue_entry(&wait, current);
current->state = TASK_INTERRUPTIBLE;
- SLEEP_ON_HEAD
+ sleep_on_head(q, &wait, &flags);
schedule();
- SLEEP_ON_TAIL
+ sleep_on_tail(q, &wait, &flags);
}
EXPORT_SYMBOL(interruptible_sleep_on);
-long fastcall __sched
+long __sched
interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
- SLEEP_ON_VAR
+ unsigned long flags;
+ wait_queue_t wait;
+
+ init_waitqueue_entry(&wait, current);
current->state = TASK_INTERRUPTIBLE;
- SLEEP_ON_HEAD
+ sleep_on_head(q, &wait, &flags);
timeout = schedule_timeout(timeout);
- SLEEP_ON_TAIL
+ sleep_on_tail(q, &wait, &flags);
return timeout;
}
EXPORT_SYMBOL(interruptible_sleep_on_timeout);
-void fastcall __sched sleep_on(wait_queue_head_t *q)
+void __sched sleep_on(wait_queue_head_t *q)
{
- SLEEP_ON_VAR
+ unsigned long flags;
+ wait_queue_t wait;
+
+ init_waitqueue_entry(&wait, current);
current->state = TASK_UNINTERRUPTIBLE;
- SLEEP_ON_HEAD
+ sleep_on_head(q, &wait, &flags);
schedule();
- SLEEP_ON_TAIL
+ sleep_on_tail(q, &wait, &flags);
}
EXPORT_SYMBOL(sleep_on);
-long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
{
- SLEEP_ON_VAR
+ unsigned long flags;
+ wait_queue_t wait;
+
+ init_waitqueue_entry(&wait, current);
current->state = TASK_UNINTERRUPTIBLE;
- SLEEP_ON_HEAD
+ sleep_on_head(q, &wait, &flags);
timeout = schedule_timeout(timeout);
- SLEEP_ON_TAIL
+ sleep_on_tail(q, &wait, &flags);
return timeout;
}
-
EXPORT_SYMBOL(sleep_on_timeout);
#ifdef CONFIG_RT_MUTEXES
state = p->state ? __ffs(p->state) + 1 : 0;
printk("%-13.13s %c", p->comm,
state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if (BITS_PER_LONG == 32)
+#if BITS_PER_LONG == 32
if (state == TASK_RUNNING)
- printk(" running ");
+ printk(" running ");
else
- printk(" %08lX ", thread_saved_pc(p));
+ printk(" %08lx ", thread_saved_pc(p));
#else
if (state == TASK_RUNNING)
- printk(" running task ");
+ printk(" running task ");
else
printk(" %016lx ", thread_saved_pc(p));
#endif
free = (unsigned long)n - (unsigned long)end_of_stack(p);
}
#endif
- printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
- if (!p->mm)
- printk(" (L-TLB)\n");
- else
- printk(" (NOTLB)\n");
+ printk("%5lu %5d %6d\n", free, p->pid, p->parent->pid);
if (state != TASK_RUNNING)
show_stack(p, NULL);
{
struct task_struct *g, *p;
-#if (BITS_PER_LONG == 32)
- printk("\n"
- " free sibling\n");
- printk(" task PC stack pid father child younger older\n");
+#if BITS_PER_LONG == 32
+ printk(KERN_INFO
+ " task PC stack pid father\n");
#else
- printk("\n"
- " free sibling\n");
- printk(" task PC stack pid father child younger older\n");
+ printk(KERN_INFO
+ " task PC stack pid father\n");
#endif
read_lock(&tasklist_lock);
do_each_thread(g, p) {
static inline void sched_init_granularity(void)
{
unsigned int factor = 1 + ilog2(num_online_cpus());
- const unsigned long gran_limit = 10000000;
+ const unsigned long gran_limit = 100000000;
sysctl_sched_granularity *= factor;
if (sysctl_sched_granularity > gran_limit)
struct migration_req *req;
struct list_head *head;
- try_to_freeze();
-
spin_lock_irq(&rq->lock);
if (cpu_is_offline(cpu)) {
p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
if (IS_ERR(p))
return NOTIFY_BAD;
- p->flags |= PF_NOFREEZE;
kthread_bind(p, cpu);
/* Must be high prio: stop_machine expects to yield to it. */
rq = task_rq_lock(p, &flags);
sched_group_nodes[i] = sg;
for_each_cpu_mask(j, nodemask) {
struct sched_domain *sd;
+
sd = &per_cpu(node_domains, j);
sd->groups = sg;
}