}
/*
- * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
while (sd) {
cpumask_t span;
struct sched_group *group;
- int new_cpu;
- int weight;
+ int new_cpu, weight;
+
+ if (!(sd->flags & flag)) {
+ sd = sd->child;
+ continue;
+ }
span = sd->span;
group = find_idlest_group(sd, t, cpu);
- if (!group)
- goto nextlevel;
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
new_cpu = find_idlest_cpu(group, t, cpu);
- if (new_cpu == -1 || new_cpu == cpu)
- goto nextlevel;
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
- /* Now try balancing at a lower domain level */
+ /* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
-nextlevel:
sd = NULL;
weight = cpus_weight(span);
for_each_domain(cpu, tmp) {
struct mm_struct *mm = next->mm;
struct mm_struct *oldmm = prev->active_mm;
- if (unlikely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (unlikely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
WARN_ON(rq->prev_mm);
rq->prev_mm = oldmm;
struct rq *busiest;
cpumask_t cpus = CPU_MASK_ALL;
+ /*
+ * When power savings policy is enabled for the parent domain, idle
+ * sibling can pick up load irrespective of busy siblings. In this case,
+ * let the state of idle sibling percolate up as IDLE, instead of
+ * portraying it as NOT_IDLE.
+ */
if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
schedstat_inc(sd, lb_cnt[idle]);
}
if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
return nr_moved;
sd->balance_interval *= 2;
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
return 0;
}
int sd_idle = 0;
cpumask_t cpus = CPU_MASK_ALL;
- if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+ /*
+ * When power savings policy is enabled for the parent domain, idle
+ * sibling can pick up load irrespective of busy siblings. In this case,
+ * let the state of idle sibling percolate up as IDLE, instead of
+ * portraying it as NOT_IDLE.
+ */
+ if (sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
sd_idle = 1;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
if (!nr_moved) {
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+ if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
} else
sd->nr_balance_failed = 0;
out_balanced:
schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !sched_smt_power_savings)
+ !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
return -1;
sd->nr_balance_failed = 0;
* If there is a non-zero preempt_count or interrupts are disabled,
* we do not want to preempt the current task. Just return..
*/
- if (unlikely(ti->preempt_count || irqs_disabled()))
+ if (likely(ti->preempt_count || irqs_disabled()))
return;
need_resched:
if (sd->flags & (SD_LOAD_BALANCE |
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
- SD_BALANCE_EXEC)) {
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUPOWER |
+ SD_SHARE_PKG_RESOURCES)) {
if (sd->groups != sd->groups->next)
return 0;
}
pflags &= ~(SD_LOAD_BALANCE |
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
- SD_BALANCE_EXEC);
+ SD_BALANCE_EXEC |
+ SD_SHARE_CPUPOWER |
+ SD_SHARE_PKG_RESOURCES);
}
if (~cflags & pflags)
return 0;
struct sched_domain *parent = tmp->parent;
if (!parent)
break;
- if (sd_parent_degenerate(tmp, parent))
+ if (sd_parent_degenerate(tmp, parent)) {
tmp->parent = parent->parent;
+ if (parent->parent)
+ parent->parent->child = tmp;
+ }
}
- if (sd && sd_degenerate(sd))
+ if (sd && sd_degenerate(sd)) {
sd = sd->parent;
+ if (sd)
+ sd->child = NULL;
+ }
sched_domain_debug(sd, cpu);
}
#endif
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+ struct sched_domain *child;
+ struct sched_group *group;
+
+ WARN_ON(!sd || !sd->groups);
+
+ if (cpu != first_cpu(sd->groups->cpumask))
+ return;
+
+ child = sd->child;
+
+ /*
+ * For perf policy, if the groups in child domain share resources
+ * (for example cores sharing some portions of the cache hierarchy
+ * or SMT), then set this domain groups cpu_power such that each group
+ * can handle only one task, when there are other idle groups in the
+ * same sched domain.
+ */
+ if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+ (child->flags &
+ (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ return;
+ }
+
+ sd->groups->cpu_power = 0;
+
+ /*
+ * add cpu_power of each child group to this groups cpu_power
+ */
+ group = child->groups;
+ do {
+ sd->groups->cpu_power += group->cpu_power;
+ group = group->next;
+ } while (group != child->groups);
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
static int build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+ struct sched_domain *sd;
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
struct sched_group *sched_group_allnodes = NULL;
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
if (!sched_group_allnodes) {
sched_group_allnodes
- = kmalloc(sizeof(struct sched_group)
- * MAX_NUMNODES,
- GFP_KERNEL);
+ = kmalloc_node(sizeof(struct sched_group)
+ * MAX_NUMNODES,
+ GFP_KERNEL,
+ cpu_to_node(i));
if (!sched_group_allnodes) {
printk(KERN_WARNING
"Can not alloc allnodes sched group\n");
*sd = SD_NODE_INIT;
sd->span = sched_domain_node_span(cpu_to_node(i));
sd->parent = p;
+ if (p)
+ p->child = sd;
cpus_and(sd->span, sd->span, *cpu_map);
#endif
*sd = SD_CPU_INIT;
sd->span = nodemask;
sd->parent = p;
+ if (p)
+ p->child = sd;
sd->groups = &sched_group_phys[group];
#ifdef CONFIG_SCHED_MC
sd->span = cpu_coregroup_map(i);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
+ p->child = sd;
sd->groups = &sched_group_core[group];
#endif
sd->span = cpu_sibling_map[i];
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
+ p->child = sd;
sd->groups = &sched_group_cpus[group];
#endif
}
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu_mask(i, *cpu_map) {
- struct sched_domain *sd;
sd = &per_cpu(cpu_domains, i);
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
+ init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
for_each_cpu_mask(i, *cpu_map) {
- int power;
- struct sched_domain *sd;
sd = &per_cpu(core_domains, i);
- if (sched_smt_power_savings)
- power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
- else
- power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
- * SCHED_LOAD_SCALE / 10;
- sd->groups->cpu_power = power;
+ init_sched_groups_power(i, sd);
}
#endif
for_each_cpu_mask(i, *cpu_map) {
- struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
sd = &per_cpu(phys_domains, i);
- if (i != first_cpu(sd->groups->cpumask))
- continue;
-
- sd->groups->cpu_power = 0;
- if (sched_mc_power_savings || sched_smt_power_savings) {
- int j;
-
- for_each_cpu_mask(j, sd->groups->cpumask) {
- struct sched_domain *sd1;
- sd1 = &per_cpu(core_domains, j);
- /*
- * for each core we will add once
- * to the group in physical domain
- */
- if (j != first_cpu(sd1->groups->cpumask))
- continue;
-
- if (sched_smt_power_savings)
- sd->groups->cpu_power += sd1->groups->cpu_power;
- else
- sd->groups->cpu_power += SCHED_LOAD_SCALE;
- }
- } else
- /*
- * This has to be < 2 * SCHED_LOAD_SCALE
- * Lets keep it SCHED_LOAD_SCALE, so that
- * while calculating NUMA group's cpu_power
- * we can simply do
- * numa_group->cpu_power += phys_group->cpu_power;
- *
- * See "only add power once for each physical pkg"
- * comment below
- */
- sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
- int power;
- sd = &per_cpu(phys_domains, i);
- if (sched_smt_power_savings)
- power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
- else
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
-#endif
+ init_sched_groups_power(i, sd);
}
#ifdef CONFIG_NUMA