+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ * root 1 - thread
+ * / | \ A - group
+ * A 1 B
+ * /|\ / \
+ * C 2 D 3 4
+ * | |
+ * 5 6
+ *
+ * load:
+ * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ * which equals 1/9-th of the total load.
+ *
+ * shares:
+ * The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ * B would get 2.
+ *
+ * task_weight:
+ * Part of the rq_weight contributed by tasks; all groups except B would
+ * get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+ return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+ struct sched_domain *sd)
+{
+ struct task_group *parent, *child;
+
+ rcu_read_lock();
+ parent = &root_task_group;
+down:
+ (*down)(parent, sd);
+ list_for_each_entry_rcu(child, &parent->children, siblings) {
+ parent = child;
+ goto down;
+
+up:
+ continue;
+ }
+ (*up)(parent, sd);
+
+ child = parent;
+ parent = parent->parent;
+ if (parent)
+ goto up;
+ rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long rq_weight = 0;
+ unsigned long task_weight = 0;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ rq_weight += tg->cfs_rq[i]->load.weight;
+ task_weight += tg->cfs_rq[i]->task_weight;
+ }
+
+ aggregate(tg, sd)->rq_weight = rq_weight;
+ aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long shares = 0;
+ int i;
+
+ for_each_cpu_mask(i, sd->span)
+ shares += tg->cfs_rq[i]->shares;
+
+ if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+ shares = tg->shares;
+
+ aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long load;
+
+ if (!tg->parent) {
+ int i;
+
+ load = 0;
+ for_each_cpu_mask(i, sd->span)
+ load += cpu_rq(i)->load.weight;
+
+ } else {
+ load = aggregate(tg->parent, sd)->load;
+
+ /*
+ * shares is our weight in the parent's rq so
+ * shares/parent->rq_weight gives our fraction of the load
+ */
+ load *= aggregate(tg, sd)->shares;
+ load /= aggregate(tg->parent, sd)->rq_weight + 1;
+ }
+
+ aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+ int tcpu)
+{
+ int boost = 0;
+ unsigned long shares;
+ unsigned long rq_weight;
+
+ if (!tg->se[tcpu])
+ return;
+
+ rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+ /*
+ * If there are currently no tasks on the cpu pretend there is one of
+ * average load so that when a new task gets to run here it will not
+ * get delayed by group starvation.
+ */
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
+
+ /*
+ * \Sum shares * rq_weight
+ * shares = -----------------------
+ * \Sum rq_weight
+ *
+ */
+ shares = aggregate(tg, sd)->shares * rq_weight;
+ shares /= aggregate(tg, sd)->rq_weight + 1;
+
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+
+ __set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int scpu, int dcpu)
+{
+ unsigned long shares;
+
+ shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+ __update_group_shares_cpu(tg, sd, scpu);
+ __update_group_shares_cpu(tg, sd, dcpu);
+
+ /*
+ * ensure we never loose shares due to rounding errors in the
+ * above redistribution.
+ */
+ shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+ if (shares)
+ tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int scpu, int dcpu)
+{
+ while (tg) {
+ __move_group_shares(tg, sd, scpu, dcpu);
+ tg = tg->parent;
+ }
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long shares = aggregate(tg, sd)->shares;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __update_group_shares_cpu(tg, sd, i);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ aggregate_group_shares(tg, sd);
+
+ /*
+ * ensure we never loose shares due to rounding errors in the
+ * above redistribution.
+ */
+ shares -= aggregate(tg, sd)->shares;
+ if (shares) {
+ tg->cfs_rq[sd->first_cpu]->shares += shares;
+ aggregate(tg, sd)->shares += shares;
+ }
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+ aggregate_group_weight(tg, sd);
+ aggregate_group_shares(tg, sd);
+ aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+ aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+ if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+ return 0;
+
+ aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+ return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+ spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+ cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+ return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
+#else /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+}
+#endif
+