err.no Git - linux-2.6/blob - arch/i386/xen/time.c

   1 /*
   2  * Xen time implementation.
   3  *
   4  * This is implemented in terms of a clocksource driver which uses
   5  * the hypervisor clock as a nanosecond timebase, and a clockevent
   6  * driver which uses the hypervisor's timer mechanism.
   7  *
   8  * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
   9  */
  10 #include <linux/kernel.h>
  11 #include <linux/interrupt.h>
  12 #include <linux/clocksource.h>
  13 #include <linux/clockchips.h>
  14 #include <linux/kernel_stat.h>
  15
  16 #include <asm/xen/hypervisor.h>
  17 #include <asm/xen/hypercall.h>
  18
  19 #include <xen/events.h>
  20 #include <xen/interface/xen.h>
  21 #include <xen/interface/vcpu.h>
  22
  23 #include "xen-ops.h"
  24
   25 #define XEN_SHIFT 22    /* clocksource shift; paired with mult = 1 << XEN_SHIFT */
   26
   27 /* Xen may fire a timer up to this many ns early */
   28 #define TIMER_SLOP      100000
   29 #define NS_PER_TICK     (1000000000LL / HZ)     /* nanoseconds in one 1/HZ tick */
  30
   31 /* These are periodically updated in shared_info, and then copied here. */
   32 struct shadow_time_info {
   33         u64 tsc_timestamp;     /* TSC at last update of time vals.  */
   34         u64 system_timestamp;  /* Time, in nanosecs, since boot.    */
   35         u32 tsc_to_nsec_mul;   /* 32-bit fraction that scales a TSC delta to ns */
   36         int tsc_shift;         /* shift applied to a TSC delta before the multiply */
   37         u32 version;           /* Xen's version value when this copy was taken */
   38 };
  39
   40 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);    /* filled in by get_time_values_from_xen() */
   41
   42 /* runstate info updated by Xen; the area is registered in setup_runstate_info() */
   43 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
   44
   45 /* snapshot of runstate info saved by the previous do_stolen_accounting() pass */
   46 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);
   47
   48 /* unused ns of stolen and blocked time, carried over to the next accounting pass */
   49 static DEFINE_PER_CPU(u64, residual_stolen);
   50 static DEFINE_PER_CPU(u64, residual_blocked);
  51
  52 /* return an consistent snapshot of 64-bit time/counter value */
  53 static u64 get64(const u64 *p)
  54 {
  55         u64 ret;
  56
  57         if (BITS_PER_LONG < 64) {
  58                 u32 *p32 = (u32 *)p;
  59                 u32 h, l;
  60
  61                 /*
  62                  * Read high then low, and then make sure high is
  63                  * still the same; this will only loop if low wraps
  64                  * and carries into high.
  65                  * XXX some clean way to make this endian-proof?
  66                  */
  67                 do {
  68                         h = p32[1];
  69                         barrier();
  70                         l = p32[0];
  71                         barrier();
  72                 } while (p32[1] != h);
  73
  74                 ret = (((u64)h) << 32) | l;
  75         } else
  76                 ret = *p;
  77
  78         return ret;
  79 }
  80
  81 /*
  82  * Runstate accounting
  83  */
  84 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
  85 {
  86         u64 state_time;
  87         struct vcpu_runstate_info *state;
  88
  89         preempt_disable();
  90
  91         state = &__get_cpu_var(runstate);
  92
  93         /*
  94          * The runstate info is always updated by the hypervisor on
  95          * the current CPU, so there's no need to use anything
  96          * stronger than a compiler barrier when fetching it.
  97          */
  98         do {
  99                 state_time = get64(&state->state_entry_time);
 100                 barrier();
 101                 *res = *state;
 102                 barrier();
 103         } while (get64(&state->state_entry_time) != state_time);
 104
 105         preempt_enable();
 106 }
 107
 108 static void setup_runstate_info(int cpu)
 109 {
 110         struct vcpu_register_runstate_memory_area area;
 111
 112         area.addr.v = &per_cpu(runstate, cpu);
 113
 114         if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
 115                                cpu, &area))
 116                 BUG();
 117 }
 118
 119 static void do_stolen_accounting(void)
 120 {
 121         struct vcpu_runstate_info state;
 122         struct vcpu_runstate_info *snap;
 123         s64 blocked, runnable, offline, stolen;
 124         cputime_t ticks;
 125
 126         get_runstate_snapshot(&state);
 127
 128         WARN_ON(state.state != RUNSTATE_running);
 129
 130         snap = &__get_cpu_var(runstate_snapshot);
 131
 132         /* work out how much time the VCPU has not been runn*ing*  */
 133         blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
 134         runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
 135         offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
 136
 137         *snap = state;
 138
 139         /* Add the appropriate number of ticks of stolen time,
 140            including any left-overs from last time.  Passing NULL to
 141            account_steal_time accounts the time as stolen. */
 142         stolen = runnable + offline + __get_cpu_var(residual_stolen);
 143
 144         if (stolen < 0)
 145                 stolen = 0;
 146
 147         ticks = 0;
 148         while (stolen >= NS_PER_TICK) {
 149                 ticks++;
 150                 stolen -= NS_PER_TICK;
 151         }
 152         __get_cpu_var(residual_stolen) = stolen;
 153         account_steal_time(NULL, ticks);
 154
 155         /* Add the appropriate number of ticks of blocked time,
 156            including any left-overs from last time.  Passing idle to
 157            account_steal_time accounts the time as idle/wait. */
 158         blocked += __get_cpu_var(residual_blocked);
 159
 160         if (blocked < 0)
 161                 blocked = 0;
 162
 163         ticks = 0;
 164         while (blocked >= NS_PER_TICK) {
 165                 ticks++;
 166                 blocked -= NS_PER_TICK;
 167         }
 168         __get_cpu_var(residual_blocked) = blocked;
 169         account_steal_time(idle_task(smp_processor_id()), ticks);
 170 }
 171
 172
 173
 174 /* Get the CPU speed from Xen */
 175 unsigned long xen_cpu_khz(void)
 176 {
 177         u64 cpu_khz = 1000000ULL << 32;
 178         const struct vcpu_time_info *info =
 179                 &HYPERVISOR_shared_info->vcpu_info[0].time;
 180
 181         do_div(cpu_khz, info->tsc_to_system_mul);
 182         if (info->tsc_shift < 0)
 183                 cpu_khz <<= -info->tsc_shift;
 184         else
 185                 cpu_khz >>= info->tsc_shift;
 186
 187         return cpu_khz;
 188 }
 189
 190 /*
 191  * Reads a consistent set of time-base values from Xen, into a shadow data
 192  * area.
 193  */
 194 static unsigned get_time_values_from_xen(void)
 195 {
 196         struct vcpu_time_info   *src;
 197         struct shadow_time_info *dst;
 198
 199         /* src is shared memory with the hypervisor, so we need to
 200            make sure we get a consistent snapshot, even in the face of
 201            being preempted. */
 202         src = &__get_cpu_var(xen_vcpu)->time;
 203         dst = &__get_cpu_var(shadow_time);
 204
 205         do {
 206                 dst->version = src->version;
 207                 rmb();          /* fetch version before data */
 208                 dst->tsc_timestamp     = src->tsc_timestamp;
 209                 dst->system_timestamp  = src->system_time;
 210                 dst->tsc_to_nsec_mul   = src->tsc_to_system_mul;
 211                 dst->tsc_shift         = src->tsc_shift;
 212                 rmb();          /* test version after fetching data */
 213         } while ((src->version & 1) | (dst->version ^ src->version));
 214
 215         return dst->version;
 216 }
 217
 218 /*
 219  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
 220  * yielding a 64-bit result.
 221  */
 222 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
 223 {
 224         u64 product;
 225 #ifdef __i386__
 226         u32 tmp1, tmp2;
 227 #endif
 228
 229         if (shift < 0)
 230                 delta >>= -shift;
 231         else
 232                 delta <<= shift;
 233
 234 #ifdef __i386__
 235         __asm__ (
 236                 "mul  %5       ; "
 237                 "mov  %4,%%eax ; "
 238                 "mov  %%edx,%4 ; "
 239                 "mul  %5       ; "
 240                 "xor  %5,%5    ; "
 241                 "add  %4,%%eax ; "
 242                 "adc  %5,%%edx ; "
 243                 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
 244                 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
 245 #elif __x86_64__
 246         __asm__ (
 247                 "mul %%rdx ; shrd $32,%%rdx,%%rax"
 248                 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
 249 #else
 250 #error implement me!
 251 #endif
 252
 253         return product;
 254 }
 255
 256 static u64 get_nsec_offset(struct shadow_time_info *shadow)
 257 {
 258         u64 now, delta;
 259         now = native_read_tsc();
 260         delta = now - shadow->tsc_timestamp;
 261         return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
 262 }
 263
 264 cycle_t xen_clocksource_read(void)
 265 {
 266         struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
 267         cycle_t ret;
 268         unsigned version;
 269
 270         do {
 271                 version = get_time_values_from_xen();
 272                 barrier();
 273                 ret = shadow->system_timestamp + get_nsec_offset(shadow);
 274                 barrier();
 275         } while (version != __get_cpu_var(xen_vcpu)->time.version);
 276
 277         put_cpu_var(shadow_time);
 278
 279         return ret;
 280 }
 281
 282 static void xen_read_wallclock(struct timespec *ts)
 283 {
 284         const struct shared_info *s = HYPERVISOR_shared_info;
 285         u32 version;
 286         u64 delta;
 287         struct timespec now;
 288
 289         /* get wallclock at system boot */
 290         do {
 291                 version = s->wc_version;
 292                 rmb();          /* fetch version before time */
 293                 now.tv_sec  = s->wc_sec;
 294                 now.tv_nsec = s->wc_nsec;
 295                 rmb();          /* fetch time before checking version */
 296         } while ((s->wc_version & 1) | (version ^ s->wc_version));
 297
 298         delta = xen_clocksource_read(); /* time since system boot */
 299         delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
 300
 301         now.tv_nsec = do_div(delta, NSEC_PER_SEC);
 302         now.tv_sec = delta;
 303
 304         set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 305 }
 306
 307 unsigned long xen_get_wallclock(void)
 308 {
 309         struct timespec ts;
 310
 311         xen_read_wallclock(&ts);
 312
 313         return ts.tv_sec;
 314 }
  315
      /*
       * Setting the wallclock is not implemented here: the argument is
       * ignored, nothing is written, and -1 is always returned.
       */
  316 int xen_set_wallclock(unsigned long now)
  317 {
  318         /* do nothing for domU */
  319         return -1;
  320 }
 321
 322 static struct clocksource xen_clocksource __read_mostly = {
 323         .name = "xen",
 324         .rating = 400,
 325         .read = xen_clocksource_read,
 326         .mask = ~0,
 327         .mult = 1<<XEN_SHIFT,           /* time directly in nanoseconds */
 328         .shift = XEN_SHIFT,
 329         .flags = CLOCK_SOURCE_IS_CONTINUOUS,
 330 };
 331
 332 /*
 333    Xen clockevent implementation
 334
 335    Xen has two clockevent implementations:
 336
 337    The old timer_op one works with all released versions of Xen prior
 338    to version 3.0.4.  This version of the hypervisor provides a
 339    single-shot timer with nanosecond resolution.  However, sharing the
 340    same event channel is a 100Hz tick which is delivered while the
 341    vcpu is running.  We don't care about or use this tick, but it will
 342    cause the core time code to think the timer fired too soon, and
 343    will end up resetting it each time.  It could be filtered, but
 344    doing so has complications when the ktime clocksource is not yet
 345    the xen clocksource (ie, at boot time).
 346
 347    The new vcpu_op-based timer interface allows the tick timer period
 348    to be changed or turned off.  The tick timer is not useful as a
 349    periodic timer because events are only delivered to running vcpus.
 350    The one-shot timer can report when a timeout is in the past, so
 351    set_next_event is capable of returning -ETIME when appropriate.
 352    This interface is used when available.
 353 */
 354
 355
 356 /*
 357   Get a hypervisor absolute time.  In theory we could maintain an
 358   offset between the kernel's time and the hypervisor's time, and
 359   apply that to a kernel's absolute timeout.  Unfortunately the
 360   hypervisor and kernel times can drift even if the kernel is using
 361   the Xen clocksource, because ntp can warp the kernel's clocksource.
 362 */
 363 static s64 get_abs_timeout(unsigned long delta)
 364 {
 365         return xen_clocksource_read() + delta;
 366 }
 367
 368 static void xen_timerop_set_mode(enum clock_event_mode mode,
 369                                  struct clock_event_device *evt)
 370 {
 371         switch (mode) {
 372         case CLOCK_EVT_MODE_PERIODIC:
 373                 /* unsupported */
 374                 WARN_ON(1);
 375                 break;
 376
 377         case CLOCK_EVT_MODE_ONESHOT:
 378                 break;
 379
 380         case CLOCK_EVT_MODE_UNUSED:
 381         case CLOCK_EVT_MODE_SHUTDOWN:
 382                 HYPERVISOR_set_timer_op(0);  /* cancel timeout */
 383                 break;
 384         }
 385 }
 386
 387 static int xen_timerop_set_next_event(unsigned long delta,
 388                                       struct clock_event_device *evt)
 389 {
 390         WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
 391
 392         if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
 393                 BUG();
 394
 395         /* We may have missed the deadline, but there's no real way of
 396            knowing for sure.  If the event was in the past, then we'll
 397            get an immediate interrupt. */
 398
 399         return 0;
 400 }
 401
 402 static const struct clock_event_device xen_timerop_clockevent = {
 403         .name = "xen",
 404         .features = CLOCK_EVT_FEAT_ONESHOT,
 405
 406         .max_delta_ns = 0xffffffff,
 407         .min_delta_ns = TIMER_SLOP,
 408
 409         .mult = 1,
 410         .shift = 0,
 411         .rating = 500,
 412
 413         .set_mode = xen_timerop_set_mode,
 414         .set_next_event = xen_timerop_set_next_event,
 415 };
 416
 417
 418
 419 static void xen_vcpuop_set_mode(enum clock_event_mode mode,
 420                                 struct clock_event_device *evt)
 421 {
 422         int cpu = smp_processor_id();
 423
 424         switch (mode) {
 425         case CLOCK_EVT_MODE_PERIODIC:
 426                 WARN_ON(1);     /* unsupported */
 427                 break;
 428
 429         case CLOCK_EVT_MODE_ONESHOT:
 430                 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 431                         BUG();
 432                 break;
 433
 434         case CLOCK_EVT_MODE_UNUSED:
 435         case CLOCK_EVT_MODE_SHUTDOWN:
 436                 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
 437                     HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
 438                         BUG();
 439                 break;
 440         }
 441 }
 442
 443 static int xen_vcpuop_set_next_event(unsigned long delta,
 444                                      struct clock_event_device *evt)
 445 {
 446         int cpu = smp_processor_id();
 447         struct vcpu_set_singleshot_timer single;
 448         int ret;
 449
 450         WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
 451
 452         single.timeout_abs_ns = get_abs_timeout(delta);
 453         single.flags = VCPU_SSHOTTMR_future;
 454
 455         ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
 456
 457         BUG_ON(ret != 0 && ret != -ETIME);
 458
 459         return ret;
 460 }
 461
 462 static const struct clock_event_device xen_vcpuop_clockevent = {
 463         .name = "xen",
 464         .features = CLOCK_EVT_FEAT_ONESHOT,
 465
 466         .max_delta_ns = 0xffffffff,
 467         .min_delta_ns = TIMER_SLOP,
 468
 469         .mult = 1,
 470         .shift = 0,
 471         .rating = 500,
 472
 473         .set_mode = xen_vcpuop_set_mode,
 474         .set_next_event = xen_vcpuop_set_next_event,
 475 };
  476
      /*
       * Template that xen_setup_timer() copies into the per-cpu device
       * below.  It starts out as the timer_op variant; xen_time_init()
       * points it at the vcpu_op variant when stopping the periodic timer
       * succeeds.
       */
  477 static const struct clock_event_device *xen_clockevent =
  478         &xen_timerop_clockevent;
  479 static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);     /* device registered by xen_setup_timer() */
 480
 481 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
 482 {
 483         struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
 484         irqreturn_t ret;
 485
 486         ret = IRQ_NONE;
 487         if (evt->event_handler) {
 488                 evt->event_handler(evt);
 489                 ret = IRQ_HANDLED;
 490         }
 491
 492         do_stolen_accounting();
 493
 494         return ret;
 495 }
 496
 497 static void xen_setup_timer(int cpu)
 498 {
 499         const char *name;
 500         struct clock_event_device *evt;
 501         int irq;
 502
 503         printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
 504
 505         name = kasprintf(GFP_KERNEL, "timer%d", cpu);
 506         if (!name)
 507                 name = "<timer kasprintf failed>";
 508
 509         irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
 510                                       IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
 511                                       name, NULL);
 512
 513         evt = &get_cpu_var(xen_clock_events);
 514         memcpy(evt, xen_clockevent, sizeof(*evt));
 515
 516         evt->cpumask = cpumask_of_cpu(cpu);
 517         evt->irq = irq;
 518         clockevents_register_device(evt);
 519
 520         setup_runstate_info(cpu);
 521
 522         put_cpu_var(xen_clock_events);
 523 }
  524
      /*
       * Boot-time setup of Xen timekeeping on the calling CPU: prime the
       * shadow time values, register the Xen clocksource, pick the
       * clockevent implementation, set the initial system time, and
       * install the timer for this CPU.
       */
  525 __init void xen_time_init(void)
  526 {
  527         int cpu = smp_processor_id();
  528
              /* take a first copy of Xen's time parameters for this CPU */
  529         get_time_values_from_xen();
  530
  531         clocksource_register(&xen_clocksource);
  532
              /* probe for the vcpu_op interface; timer_op remains the default */
  533         if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
  534                 /* Successfully turned off 100Hz tick, so we have the
  535                    vcpuop-based timer interface */
  536                 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
  537                 xen_clockevent = &xen_vcpuop_clockevent;
  538         }
  539
  540         /* Set initial system time with full resolution */
  541         xen_read_wallclock(&xtime);
              /* wall_to_monotonic is set to the negated initial wallclock */
  542         set_normalized_timespec(&wall_to_monotonic,
  543                                 -xtime.tv_sec, -xtime.tv_nsec);
  544
              /* NOTE(review): presumably allows TSC use again - tsc_disable is defined elsewhere; confirm */
  545         tsc_disable = 0;
  546
              /* must follow the xen_clockevent selection above, which it copies */
  547         xen_setup_timer(cpu);
  548 }