err.no Git - linux-2.6/blob - kernel/sched_fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 20ms, units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length.
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches field)
 *
 * On SMP systems the value of this is multiplied by the log2 of the
 * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way
 * systems, 4x on 8-way systems, 5x on 16-way systems, etc.)
 */
const_debug unsigned int sysctl_sched_latency = 20000000ULL;

/*
 * After fork, child runs first. (default) If set to 0 then
 * parent will (try to) run first.
 */
const_debug unsigned int sysctl_sched_child_runs_first = 1;

/*
 * Number of runnable tasks that fit into one sysctl_sched_latency
 * period: (default: 20, units: tasks)
 *
 * With more runnable tasks than this, __sched_period() stretches the
 * period proportionally (latency * nr_running / nr_latency), and
 * __sched_vslice() clamps its divisor to this value.
 */
const_debug unsigned int sysctl_sched_nr_latency = 20;

/*
 * sys_sched_yield() compat mode
 *
 * This option switches the aggressive yield implementation of the
 * old scheduler back on.
 */
unsigned int __read_mostly sysctl_sched_compat_yield;

/*
 * SCHED_BATCH wake-up granularity.
 * (default: 25 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 25000000UL;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 2 msec, units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
const_debug unsigned int sysctl_sched_wakeup_granularity = 2000000UL;

/* Forward declaration; the class is defined near the end of this file. */
extern struct sched_class fair_sched_class;
  80
  81 /**************************************************************
  82  * CFS operations on generic schedulable entities:
  83  */
  84
  85 #ifdef CONFIG_FAIR_GROUP_SCHED
  86
  87 /* cpu runqueue to which this cfs_rq is attached */
  88 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  89 {
  90         return cfs_rq->rq;
  91 }
  92
  93 /* An entity is a task if it doesn't "own" a runqueue */
  94 #define entity_is_task(se)      (!se->my_q)
  95
  96 #else   /* CONFIG_FAIR_GROUP_SCHED */
  97
  98 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
  99 {
 100         return container_of(cfs_rq, struct rq, cfs);
 101 }
 102
 103 #define entity_is_task(se)      1
 104
 105 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 106
 107 static inline struct task_struct *task_of(struct sched_entity *se)
 108 {
 109         return container_of(se, struct task_struct, se);
 110 }
 111
 112
 113 /**************************************************************
 114  * Scheduling class tree data structure manipulation methods:
 115  */
 116
 117 static inline u64
 118 max_vruntime(u64 min_vruntime, u64 vruntime)
 119 {
 120         s64 delta = (s64)(vruntime - min_vruntime);
 121         if (delta > 0)
 122                 min_vruntime = vruntime;
 123
 124         return min_vruntime;
 125 }
 126
 127 static inline u64
 128 min_vruntime(u64 min_vruntime, u64 vruntime)
 129 {
 130         s64 delta = (s64)(vruntime - min_vruntime);
 131         if (delta < 0)
 132                 min_vruntime = vruntime;
 133
 134         return min_vruntime;
 135 }
 136
 137 static inline s64
 138 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se)
 139 {
 140         return se->vruntime - cfs_rq->min_vruntime;
 141 }
 142
 143 /*
 144  * Enqueue an entity into the rb-tree:
 145  */
 146 static void
 147 __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 148 {
 149         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 150         struct rb_node *parent = NULL;
 151         struct sched_entity *entry;
 152         s64 key = entity_key(cfs_rq, se);
 153         int leftmost = 1;
 154
 155         /*
 156          * Find the right place in the rbtree:
 157          */
 158         while (*link) {
 159                 parent = *link;
 160                 entry = rb_entry(parent, struct sched_entity, run_node);
 161                 /*
 162                  * We dont care about collisions. Nodes with
 163                  * the same key stay together.
 164                  */
 165                 if (key < entity_key(cfs_rq, entry)) {
 166                         link = &parent->rb_left;
 167                 } else {
 168                         link = &parent->rb_right;
 169                         leftmost = 0;
 170                 }
 171         }
 172
 173         /*
 174          * Maintain a cache of leftmost tree entries (it is frequently
 175          * used):
 176          */
 177         if (leftmost)
 178                 cfs_rq->rb_leftmost = &se->run_node;
 179
 180         rb_link_node(&se->run_node, parent, link);
 181         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 182 }
 183
 184 static void
 185 __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 186 {
 187         if (cfs_rq->rb_leftmost == &se->run_node)
 188                 cfs_rq->rb_leftmost = rb_next(&se->run_node);
 189
 190         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 191 }
 192
 193 static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
 194 {
 195         return cfs_rq->rb_leftmost;
 196 }
 197
 198 static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
 199 {
 200         return rb_entry(first_fair(cfs_rq), struct sched_entity, run_node);
 201 }
 202
 203 static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 204 {
 205         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 206         struct sched_entity *se = NULL;
 207         struct rb_node *parent;
 208
 209         while (*link) {
 210                 parent = *link;
 211                 se = rb_entry(parent, struct sched_entity, run_node);
 212                 link = &parent->rb_right;
 213         }
 214
 215         return se;
 216 }
 217
 218 /**************************************************************
 219  * Scheduling class statistics methods:
 220  */
 221
 222 static u64 __sched_period(unsigned long nr_running)
 223 {
 224         u64 period = sysctl_sched_latency;
 225         unsigned long nr_latency = sysctl_sched_nr_latency;
 226
 227         if (unlikely(nr_running > nr_latency)) {
 228                 period *= nr_running;
 229                 do_div(period, nr_latency);
 230         }
 231
 232         return period;
 233 }
 234
 235 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 236 {
 237         u64 period = __sched_period(cfs_rq->nr_running);
 238
 239         period *= se->load.weight;
 240         do_div(period, cfs_rq->load.weight);
 241
 242         return period;
 243 }
 244
 245 static u64 __sched_vslice(unsigned long nr_running)
 246 {
 247         unsigned long period = sysctl_sched_latency;
 248         unsigned long nr_latency = sysctl_sched_nr_latency;
 249
 250         if (unlikely(nr_running > nr_latency))
 251                 nr_running = nr_latency;
 252
 253         period /= nr_running;
 254
 255         return (u64)period;
 256 }
 257
 258 /*
 259  * Update the current task's runtime statistics. Skip current tasks that
 260  * are not in our scheduling class.
 261  */
 262 static inline void
 263 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 264               unsigned long delta_exec)
 265 {
 266         unsigned long delta_exec_weighted;
 267         u64 vruntime;
 268
 269         schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
 270
 271         curr->sum_exec_runtime += delta_exec;
 272         schedstat_add(cfs_rq, exec_clock, delta_exec);
 273         delta_exec_weighted = delta_exec;
 274         if (unlikely(curr->load.weight != NICE_0_LOAD)) {
 275                 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
 276                                                         &curr->load);
 277         }
 278         curr->vruntime += delta_exec_weighted;
 279
 280         /*
 281          * maintain cfs_rq->min_vruntime to be a monotonic increasing
 282          * value tracking the leftmost vruntime in the tree.
 283          */
 284         if (first_fair(cfs_rq)) {
 285                 vruntime = min_vruntime(curr->vruntime,
 286                                 __pick_next_entity(cfs_rq)->vruntime);
 287         } else
 288                 vruntime = curr->vruntime;
 289
 290         cfs_rq->min_vruntime =
 291                 max_vruntime(cfs_rq->min_vruntime, vruntime);
 292 }
 293
 294 static void update_curr(struct cfs_rq *cfs_rq)
 295 {
 296         struct sched_entity *curr = cfs_rq->curr;
 297         u64 now = rq_of(cfs_rq)->clock;
 298         unsigned long delta_exec;
 299
 300         if (unlikely(!curr))
 301                 return;
 302
 303         /*
 304          * Get the amount of time the current task was running
 305          * since the last time we changed load (this cannot
 306          * overflow on 32 bits):
 307          */
 308         delta_exec = (unsigned long)(now - curr->exec_start);
 309
 310         __update_curr(cfs_rq, curr, delta_exec);
 311         curr->exec_start = now;
 312 }
 313
 314 static inline void
 315 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 316 {
 317         schedstat_set(se->wait_start, rq_of(cfs_rq)->clock);
 318 }
 319
 320 static inline unsigned long
 321 calc_weighted(unsigned long delta, struct sched_entity *se)
 322 {
 323         unsigned long weight = se->load.weight;
 324
 325         if (unlikely(weight != NICE_0_LOAD))
 326                 return (u64)delta * se->load.weight >> NICE_0_SHIFT;
 327         else
 328                 return delta;
 329 }
 330
 331 /*
 332  * Task is being enqueued - update stats:
 333  */
 334 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 335 {
 336         /*
 337          * Are we enqueueing a waiting task? (for current tasks
 338          * a dequeue/enqueue event is a NOP)
 339          */
 340         if (se != cfs_rq->curr)
 341                 update_stats_wait_start(cfs_rq, se);
 342 }
 343
 344 static void
 345 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 346 {
 347         schedstat_set(se->wait_max, max(se->wait_max,
 348                         rq_of(cfs_rq)->clock - se->wait_start));
 349         schedstat_set(se->wait_start, 0);
 350 }
 351
 352 static inline void
 353 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 354 {
 355         update_curr(cfs_rq);
 356         /*
 357          * Mark the end of the wait period if dequeueing a
 358          * waiting task:
 359          */
 360         if (se != cfs_rq->curr)
 361                 update_stats_wait_end(cfs_rq, se);
 362 }
 363
 364 /*
 365  * We are picking a new current task - update its stats:
 366  */
 367 static inline void
 368 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 369 {
 370         /*
 371          * We are starting a new run period:
 372          */
 373         se->exec_start = rq_of(cfs_rq)->clock;
 374 }
 375
 376 /*
 377  * We are descheduling a task - update its stats:
 378  */
 379 static inline void
 380 update_stats_curr_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 381 {
 382         se->exec_start = 0;
 383 }
 384
 385 /**************************************************
 386  * Scheduling class queueing methods:
 387  */
 388
 389 static void
 390 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 391 {
 392         update_load_add(&cfs_rq->load, se->load.weight);
 393         cfs_rq->nr_running++;
 394         se->on_rq = 1;
 395 }
 396
 397 static void
 398 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 399 {
 400         update_load_sub(&cfs_rq->load, se->load.weight);
 401         cfs_rq->nr_running--;
 402         se->on_rq = 0;
 403 }
 404
 405 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 406 {
 407 #ifdef CONFIG_SCHEDSTATS
 408         if (se->sleep_start) {
 409                 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
 410
 411                 if ((s64)delta < 0)
 412                         delta = 0;
 413
 414                 if (unlikely(delta > se->sleep_max))
 415                         se->sleep_max = delta;
 416
 417                 se->sleep_start = 0;
 418                 se->sum_sleep_runtime += delta;
 419         }
 420         if (se->block_start) {
 421                 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
 422
 423                 if ((s64)delta < 0)
 424                         delta = 0;
 425
 426                 if (unlikely(delta > se->block_max))
 427                         se->block_max = delta;
 428
 429                 se->block_start = 0;
 430                 se->sum_sleep_runtime += delta;
 431
 432                 /*
 433                  * Blocking time is in units of nanosecs, so shift by 20 to
 434                  * get a milliseconds-range estimation of the amount of
 435                  * time that the task spent sleeping:
 436                  */
 437                 if (unlikely(prof_on == SLEEP_PROFILING)) {
 438                         struct task_struct *tsk = task_of(se);
 439
 440                         profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 441                                      delta >> 20);
 442                 }
 443         }
 444 #endif
 445 }
 446
 447 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 448 {
 449 #ifdef CONFIG_SCHED_DEBUG
 450         s64 d = se->vruntime - cfs_rq->min_vruntime;
 451
 452         if (d < 0)
 453                 d = -d;
 454
 455         if (d > 3*sysctl_sched_latency)
 456                 schedstat_inc(cfs_rq, nr_spread_over);
 457 #endif
 458 }
 459
 460 static void
 461 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 462 {
 463         u64 vruntime;
 464
 465         vruntime = cfs_rq->min_vruntime;
 466
 467         if (sched_feat(USE_TREE_AVG)) {
 468                 struct sched_entity *last = __pick_last_entity(cfs_rq);
 469                 if (last) {
 470                         vruntime += last->vruntime;
 471                         vruntime >>= 1;
 472                 }
 473         } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
 474                 vruntime += __sched_vslice(cfs_rq->nr_running)/2;
 475
 476         if (initial && sched_feat(START_DEBIT))
 477                 vruntime += __sched_vslice(cfs_rq->nr_running + 1);
 478
 479         if (!initial) {
 480                 if (sched_feat(NEW_FAIR_SLEEPERS))
 481                         vruntime -= sysctl_sched_latency;
 482
 483                 vruntime = max_t(s64, vruntime, se->vruntime);
 484         }
 485
 486         se->vruntime = vruntime;
 487
 488 }
 489
 490 static void
 491 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 492 {
 493         /*
 494          * Update the fair clock.
 495          */
 496         update_curr(cfs_rq);
 497
 498         if (wakeup) {
 499                 place_entity(cfs_rq, se, 0);
 500                 enqueue_sleeper(cfs_rq, se);
 501         }
 502
 503         update_stats_enqueue(cfs_rq, se);
 504         check_spread(cfs_rq, se);
 505         if (se != cfs_rq->curr)
 506                 __enqueue_entity(cfs_rq, se);
 507         account_entity_enqueue(cfs_rq, se);
 508 }
 509
 510 static void
 511 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 512 {
 513         update_stats_dequeue(cfs_rq, se);
 514         if (sleep) {
 515 #ifdef CONFIG_SCHEDSTATS
 516                 if (entity_is_task(se)) {
 517                         struct task_struct *tsk = task_of(se);
 518
 519                         if (tsk->state & TASK_INTERRUPTIBLE)
 520                                 se->sleep_start = rq_of(cfs_rq)->clock;
 521                         if (tsk->state & TASK_UNINTERRUPTIBLE)
 522                                 se->block_start = rq_of(cfs_rq)->clock;
 523                 }
 524 #endif
 525         }
 526
 527         if (se != cfs_rq->curr)
 528                 __dequeue_entity(cfs_rq, se);
 529         account_entity_dequeue(cfs_rq, se);
 530 }
 531
 532 /*
 533  * Preempt the current task with a newly woken task if needed:
 534  */
 535 static void
 536 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 537 {
 538         unsigned long ideal_runtime, delta_exec;
 539
 540         ideal_runtime = sched_slice(cfs_rq, curr);
 541         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 542         if (delta_exec > ideal_runtime)
 543                 resched_task(rq_of(cfs_rq)->curr);
 544 }
 545
 546 static void
 547 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 548 {
 549         /* 'current' is not kept within the tree. */
 550         if (se->on_rq) {
 551                 /*
 552                  * Any task has to be enqueued before it get to execute on
 553                  * a CPU. So account for the time it spent waiting on the
 554                  * runqueue.
 555                  */
 556                 update_stats_wait_end(cfs_rq, se);
 557                 __dequeue_entity(cfs_rq, se);
 558         }
 559
 560         update_stats_curr_start(cfs_rq, se);
 561         cfs_rq->curr = se;
 562 #ifdef CONFIG_SCHEDSTATS
 563         /*
 564          * Track our maximum slice length, if the CPU's load is at
 565          * least twice that of our own weight (i.e. dont track it
 566          * when there are only lesser-weight tasks around):
 567          */
 568         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
 569                 se->slice_max = max(se->slice_max,
 570                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
 571         }
 572 #endif
 573         se->prev_sum_exec_runtime = se->sum_exec_runtime;
 574 }
 575
 576 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 577 {
 578         struct sched_entity *se = __pick_next_entity(cfs_rq);
 579
 580         set_next_entity(cfs_rq, se);
 581
 582         return se;
 583 }
 584
 585 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 586 {
 587         /*
 588          * If still on the runqueue then deactivate_task()
 589          * was not called and update_curr() has to be done:
 590          */
 591         if (prev->on_rq)
 592                 update_curr(cfs_rq);
 593
 594         update_stats_curr_end(cfs_rq, prev);
 595
 596         check_spread(cfs_rq, prev);
 597         if (prev->on_rq) {
 598                 update_stats_wait_start(cfs_rq, prev);
 599                 /* Put 'current' back into the tree. */
 600                 __enqueue_entity(cfs_rq, prev);
 601         }
 602         cfs_rq->curr = NULL;
 603 }
 604
 605 static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 606 {
 607         /*
 608          * Update run-time statistics of the 'current'.
 609          */
 610         update_curr(cfs_rq);
 611
 612         if (cfs_rq->nr_running > 1)
 613                 check_preempt_tick(cfs_rq, curr);
 614 }
 615
 616 /**************************************************
 617  * CFS operations on tasks:
 618  */
 619
 620 #ifdef CONFIG_FAIR_GROUP_SCHED
 621
 622 /* Walk up scheduling entities hierarchy */
 623 #define for_each_sched_entity(se) \
 624                 for (; se; se = se->parent)
 625
 626 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 627 {
 628         return p->se.cfs_rq;
 629 }
 630
 631 /* runqueue on which this entity is (to be) queued */
 632 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 633 {
 634         return se->cfs_rq;
 635 }
 636
 637 /* runqueue "owned" by this group */
 638 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 639 {
 640         return grp->my_q;
 641 }
 642
 643 /* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on
 644  * another cpu ('this_cpu')
 645  */
 646 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 647 {
 648         return cfs_rq->tg->cfs_rq[this_cpu];
 649 }
 650
 651 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 652 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 653         list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 654
 655 /* Do the two (enqueued) entities belong to the same group ? */
 656 static inline int
 657 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 658 {
 659         if (se->cfs_rq == pse->cfs_rq)
 660                 return 1;
 661
 662         return 0;
 663 }
 664
 665 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 666 {
 667         return se->parent;
 668 }
 669
 670 #else   /* CONFIG_FAIR_GROUP_SCHED */
 671
 672 #define for_each_sched_entity(se) \
 673                 for (; se; se = NULL)
 674
 675 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 676 {
 677         return &task_rq(p)->cfs;
 678 }
 679
 680 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 681 {
 682         struct task_struct *p = task_of(se);
 683         struct rq *rq = task_rq(p);
 684
 685         return &rq->cfs;
 686 }
 687
 688 /* runqueue "owned" by this group */
 689 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 690 {
 691         return NULL;
 692 }
 693
 694 static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 695 {
 696         return &cpu_rq(this_cpu)->cfs;
 697 }
 698
 699 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 700                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 701
 702 static inline int
 703 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 704 {
 705         return 1;
 706 }
 707
 708 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 709 {
 710         return NULL;
 711 }
 712
 713 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 714
 715 /*
 716  * The enqueue_task method is called before nr_running is
 717  * increased. Here we update the fair scheduling stats and
 718  * then put the task into the rbtree:
 719  */
 720 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 721 {
 722         struct cfs_rq *cfs_rq;
 723         struct sched_entity *se = &p->se;
 724
 725         for_each_sched_entity(se) {
 726                 if (se->on_rq)
 727                         break;
 728                 cfs_rq = cfs_rq_of(se);
 729                 enqueue_entity(cfs_rq, se, wakeup);
 730         }
 731 }
 732
 733 /*
 734  * The dequeue_task method is called before nr_running is
 735  * decreased. We remove the task from the rbtree and
 736  * update the fair scheduling stats:
 737  */
 738 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 739 {
 740         struct cfs_rq *cfs_rq;
 741         struct sched_entity *se = &p->se;
 742
 743         for_each_sched_entity(se) {
 744                 cfs_rq = cfs_rq_of(se);
 745                 dequeue_entity(cfs_rq, se, sleep);
 746                 /* Don't dequeue parent if it has other entities besides us */
 747                 if (cfs_rq->load.weight)
 748                         break;
 749         }
 750 }
 751
 752 /*
 753  * sched_yield() support is very simple - we dequeue and enqueue.
 754  *
 755  * If compat_yield is turned on then we requeue to the end of the tree.
 756  */
 757 static void yield_task_fair(struct rq *rq)
 758 {
 759         struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr);
 760         struct sched_entity *rightmost, *se = &rq->curr->se;
 761
 762         /*
 763          * Are we the only task in the tree?
 764          */
 765         if (unlikely(cfs_rq->nr_running == 1))
 766                 return;
 767
 768         if (likely(!sysctl_sched_compat_yield)) {
 769                 __update_rq_clock(rq);
 770                 /*
 771                  * Dequeue and enqueue the task to update its
 772                  * position within the tree:
 773                  */
 774                 update_curr(cfs_rq);
 775
 776                 return;
 777         }
 778         /*
 779          * Find the rightmost entry in the rbtree:
 780          */
 781         rightmost = __pick_last_entity(cfs_rq);
 782         /*
 783          * Already in the rightmost position?
 784          */
 785         if (unlikely(rightmost->vruntime < se->vruntime))
 786                 return;
 787
 788         /*
 789          * Minimally necessary key value to be last in the tree:
 790          * Upon rescheduling, sched_class::put_prev_task() will place
 791          * 'current' within the tree based on its new key value.
 792          */
 793         se->vruntime = rightmost->vruntime + 1;
 794 }
 795
 796 /*
 797  * Preempt the current task with a newly woken task if needed:
 798  */
 799 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 800 {
 801         struct task_struct *curr = rq->curr;
 802         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 803         struct sched_entity *se = &curr->se, *pse = &p->se;
 804         s64 delta;
 805
 806         if (unlikely(rt_prio(p->prio))) {
 807                 update_rq_clock(rq);
 808                 update_curr(cfs_rq);
 809                 resched_task(curr);
 810                 return;
 811         }
 812
 813         while (!is_same_group(se, pse)) {
 814                 se = parent_entity(se);
 815                 pse = parent_entity(pse);
 816         }
 817
 818         delta = se->vruntime - pse->vruntime;
 819
 820         if (delta > (s64)sysctl_sched_wakeup_granularity)
 821                 resched_task(curr);
 822 }
 823
 824 static struct task_struct *pick_next_task_fair(struct rq *rq)
 825 {
 826         struct cfs_rq *cfs_rq = &rq->cfs;
 827         struct sched_entity *se;
 828
 829         if (unlikely(!cfs_rq->nr_running))
 830                 return NULL;
 831
 832         do {
 833                 se = pick_next_entity(cfs_rq);
 834                 cfs_rq = group_cfs_rq(se);
 835         } while (cfs_rq);
 836
 837         return task_of(se);
 838 }
 839
 840 /*
 841  * Account for a descheduled task:
 842  */
 843 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
 844 {
 845         struct sched_entity *se = &prev->se;
 846         struct cfs_rq *cfs_rq;
 847
 848         for_each_sched_entity(se) {
 849                 cfs_rq = cfs_rq_of(se);
 850                 put_prev_entity(cfs_rq, se);
 851         }
 852 }
 853
 854 /**************************************************
 855  * Fair scheduling class load-balancing methods:
 856  */
 857
 858 /*
 859  * Load-balancing iterator. Note: while the runqueue stays locked
 860  * during the whole iteration, the current task might be
 861  * dequeued so the iterator has to be dequeue-safe. Here we
 862  * achieve that by always pre-iterating before returning
 863  * the current task:
 864  */
 865 static inline struct task_struct *
 866 __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr)
 867 {
 868         struct task_struct *p;
 869
 870         if (!curr)
 871                 return NULL;
 872
 873         p = rb_entry(curr, struct task_struct, se.run_node);
 874         cfs_rq->rb_load_balance_curr = rb_next(curr);
 875
 876         return p;
 877 }
 878
 879 static struct task_struct *load_balance_start_fair(void *arg)
 880 {
 881         struct cfs_rq *cfs_rq = arg;
 882
 883         return __load_balance_iterator(cfs_rq, first_fair(cfs_rq));
 884 }
 885
 886 static struct task_struct *load_balance_next_fair(void *arg)
 887 {
 888         struct cfs_rq *cfs_rq = arg;
 889
 890         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 891 }
 892
 893 #ifdef CONFIG_FAIR_GROUP_SCHED
 894 static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 895 {
 896         struct sched_entity *curr;
 897         struct task_struct *p;
 898
 899         if (!cfs_rq->nr_running)
 900                 return MAX_PRIO;
 901
 902         curr = cfs_rq->curr;
 903         if (!curr)
 904                 curr = __pick_next_entity(cfs_rq);
 905
 906         p = task_of(curr);
 907
 908         return p->prio;
 909 }
 910 #endif
 911
/*
 * Pull load from 'busiest' to 'this_rq' by running balance_tasks() over
 * every leaf cfs_rq of 'busiest'.
 *
 * max_nr_move and max_load_move bound the total moved; the remaining
 * budgets are decremented after each cfs_rq and the loop stops when
 * either is used up.
 *
 * Returns the amount of load moved (max_load_move minus what is left of
 * the load budget).
 */
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_nr_move, unsigned long max_load_move,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
{
        struct cfs_rq *busy_cfs_rq;
        /* NOTE(review): total_nr_moved is accumulated but never read. */
        unsigned long load_moved, total_nr_moved = 0, nr_moved;
        /* Signed so that overshooting the budget shows up as <= 0. */
        long rem_load_move = max_load_move;
        struct rq_iterator cfs_rq_iterator;

        cfs_rq_iterator.start = load_balance_start_fair;
        cfs_rq_iterator.next = load_balance_next_fair;

        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
#ifdef CONFIG_FAIR_GROUP_SCHED
                struct cfs_rq *this_cfs_rq;
                long imbalance;
                unsigned long maxload;

                /* The same group's cfs_rq on the pulling cpu. */
                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);

                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
                if (imbalance <= 0)
                        continue;

                /* Don't pull more than imbalance/2 */
                imbalance /= 2;
                maxload = min(rem_load_move, imbalance);

                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
#else
                /*
                 * Without group scheduling the per-cfs_rq limit is simply
                 * the remaining budget.
                 * NOTE(review): this macro is never #undef'd and stays
                 * defined for the rest of the translation unit.
                 */
# define maxload rem_load_move
#endif
                /* pass busy_cfs_rq argument into
                 * load_balance_[start|next]_fair iterators
                 */
                cfs_rq_iterator.arg = busy_cfs_rq;
                nr_moved = balance_tasks(this_rq, this_cpu, busiest,
                                max_nr_move, maxload, sd, idle, all_pinned,
                                &load_moved, this_best_prio, &cfs_rq_iterator);

                total_nr_moved += nr_moved;
                max_nr_move -= nr_moved;
                rem_load_move -= load_moved;

                /*
                 * NOTE(review): max_nr_move is unsigned, so "<= 0" can
                 * only ever mean "== 0" here.
                 */
                if (max_nr_move <= 0 || rem_load_move <= 0)
                        break;
        }

        return max_load_move - rem_load_move;
}
 965
 966 /*
 967  * scheduler tick hitting a task of our scheduling class:
 968  */
 969 static void task_tick_fair(struct rq *rq, struct task_struct *curr)
 970 {
 971         struct cfs_rq *cfs_rq;
 972         struct sched_entity *se = &curr->se;
 973
 974         for_each_sched_entity(se) {
 975                 cfs_rq = cfs_rq_of(se);
 976                 entity_tick(cfs_rq, se);
 977         }
 978 }
 979
 980 #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
 981
 982 /*
 983  * Share the fairness runtime between parent and child, thus the
 984  * total amount of pressure for CPU stays equal - new tasks
 985  * get a chance to run but frequent forkers are not allowed to
 986  * monopolize the CPU. Note: the parent runqueue is locked,
 987  * the child is not running yet.
 988  */
 989 static void task_new_fair(struct rq *rq, struct task_struct *p)
 990 {
 991         struct cfs_rq *cfs_rq = task_cfs_rq(p);
 992         struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
 993
 994         sched_info_queued(p);
 995
 996         update_curr(cfs_rq);
 997         place_entity(cfs_rq, se, 1);
 998
 999         if (sysctl_sched_child_runs_first &&
1000                         curr->vruntime < se->vruntime) {
1001                 /*
1002                  * Upon rescheduling, sched_class::put_prev_task() will place
1003                  * 'current' within the tree based on its new key value.
1004                  */
1005                 swap(curr->vruntime, se->vruntime);
1006         }
1007
1008         update_stats_enqueue(cfs_rq, se);
1009         check_spread(cfs_rq, se);
1010         check_spread(cfs_rq, curr);
1011         __enqueue_entity(cfs_rq, se);
1012         account_entity_enqueue(cfs_rq, se);
1013         resched_task(rq->curr);
1014 }
1015
1016 /* Account for a task changing its policy or group.
1017  *
1018  * This routine is mostly called to set cfs_rq->curr field when a task
1019  * migrates between groups/classes.
1020  */
1021 static void set_curr_task_fair(struct rq *rq)
1022 {
1023         struct sched_entity *se = &rq->curr->se;
1024
1025         for_each_sched_entity(se)
1026                 set_next_entity(cfs_rq_of(se), se);
1027 }
1028
/*
 * All the scheduling class methods: the hook table that binds the
 * functions defined in this file to the sched_class interface.
 */
struct sched_class fair_sched_class __read_mostly = {
        /* Runqueue membership and voluntary yield. */
        .enqueue_task           = enqueue_task_fair,
        .dequeue_task           = dequeue_task_fair,
        .yield_task             = yield_task_fair,

        /* Wakeup-time preemption check. */
        .check_preempt_curr     = check_preempt_wakeup,

        /* Choosing the next task / retiring the previous one. */
        .pick_next_task         = pick_next_task_fair,
        .put_prev_task          = put_prev_task_fair,

        .load_balance           = load_balance_fair,

        /* Current-task setup, periodic tick and fork hooks. */
        .set_curr_task          = set_curr_task_fair,
        .task_tick              = task_tick_fair,
        .task_new               = task_new_fair,
};
1048
1049 #ifdef CONFIG_SCHED_DEBUG
1050 static void print_cfs_stats(struct seq_file *m, int cpu)
1051 {
1052         struct cfs_rq *cfs_rq;
1053
1054 #ifdef CONFIG_FAIR_GROUP_SCHED
1055         print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1056 #endif
1057         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1058                 print_cfs_rq(m, cpu, cfs_rq);
1059 }
1060 #endif