2 * Xen time implementation.
4 * This is implemented in terms of a clocksource driver which uses
5 * the hypervisor clock as a nanosecond timebase, and a clockevent
6 * driver which uses the hypervisor's timer mechanism.
8 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
10 #include <linux/kernel.h>
11 #include <linux/interrupt.h>
12 #include <linux/clocksource.h>
13 #include <linux/clockchips.h>
14 #include <linux/kernel_stat.h>
16 #include <asm/xen/hypervisor.h>
17 #include <asm/xen/hypercall.h>
19 #include <xen/events.h>
20 #include <xen/interface/xen.h>
21 #include <xen/interface/vcpu.h>
27 /* Xen may fire a timer up to this many ns early */
28 #define TIMER_SLOP 100000
29 #define NS_PER_TICK (1000000000LL / HZ)
31 static cycle_t xen_clocksource_read(void);
33 /* These are perodically updated in shared_info, and then copied here. */
34 struct shadow_time_info {
35 u64 tsc_timestamp; /* TSC at last update of time vals. */
36 u64 system_timestamp; /* Time, in nanosecs, since boot. */
42 static DEFINE_PER_CPU(struct shadow_time_info, shadow_time);
/* runstate info updated by Xen */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);

/* snapshots of runstate info, taken on each stolen-time accounting pass */
static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot);

/* unused ns of stolen and blocked time, carried over between ticks */
static DEFINE_PER_CPU(u64, residual_stolen);
static DEFINE_PER_CPU(u64, residual_blocked);
54 /* return an consistent snapshot of 64-bit time/counter value */
55 static u64 get64(const u64 *p)
59 if (BITS_PER_LONG < 64) {
64 * Read high then low, and then make sure high is
65 * still the same; this will only loop if low wraps
66 * and carries into high.
67 * XXX some clean way to make this endian-proof?
74 } while (p32[1] != h);
76 ret = (((u64)h) << 32) | l;
86 static void get_runstate_snapshot(struct vcpu_runstate_info *res)
89 struct vcpu_runstate_info *state;
93 state = &__get_cpu_var(runstate);
96 * The runstate info is always updated by the hypervisor on
97 * the current CPU, so there's no need to use anything
98 * stronger than a compiler barrier when fetching it.
101 state_time = get64(&state->state_entry_time);
105 } while (get64(&state->state_entry_time) != state_time);
110 static void setup_runstate_info(int cpu)
112 struct vcpu_register_runstate_memory_area area;
114 area.addr.v = &per_cpu(runstate, cpu);
116 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
121 static void do_stolen_accounting(void)
123 struct vcpu_runstate_info state;
124 struct vcpu_runstate_info *snap;
125 s64 blocked, runnable, offline, stolen;
128 get_runstate_snapshot(&state);
130 WARN_ON(state.state != RUNSTATE_running);
132 snap = &__get_cpu_var(runstate_snapshot);
134 /* work out how much time the VCPU has not been runn*ing* */
135 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
136 runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable];
137 offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline];
141 /* Add the appropriate number of ticks of stolen time,
142 including any left-overs from last time. Passing NULL to
143 account_steal_time accounts the time as stolen. */
144 stolen = runnable + offline + __get_cpu_var(residual_stolen);
150 while (stolen >= NS_PER_TICK) {
152 stolen -= NS_PER_TICK;
154 __get_cpu_var(residual_stolen) = stolen;
155 account_steal_time(NULL, ticks);
157 /* Add the appropriate number of ticks of blocked time,
158 including any left-overs from last time. Passing idle to
159 account_steal_time accounts the time as idle/wait. */
160 blocked += __get_cpu_var(residual_blocked);
166 while (blocked >= NS_PER_TICK) {
168 blocked -= NS_PER_TICK;
170 __get_cpu_var(residual_blocked) = blocked;
171 account_steal_time(idle_task(smp_processor_id()), ticks);
175 * Xen sched_clock implementation. Returns the number of unstolen
176 * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
179 unsigned long long xen_sched_clock(void)
181 struct vcpu_runstate_info state;
182 cycle_t now = xen_clocksource_read();
185 get_runstate_snapshot(&state);
187 WARN_ON(state.state != RUNSTATE_running);
189 offset = now - state.state_entry_time;
193 return state.time[RUNSTATE_blocked] +
194 state.time[RUNSTATE_running] +
199 /* Get the CPU speed from Xen */
200 unsigned long xen_cpu_khz(void)
202 u64 cpu_khz = 1000000ULL << 32;
203 const struct vcpu_time_info *info =
204 &HYPERVISOR_shared_info->vcpu_info[0].time;
206 do_div(cpu_khz, info->tsc_to_system_mul);
207 if (info->tsc_shift < 0)
208 cpu_khz <<= -info->tsc_shift;
210 cpu_khz >>= info->tsc_shift;
216 * Reads a consistent set of time-base values from Xen, into a shadow data
219 static unsigned get_time_values_from_xen(void)
221 struct vcpu_time_info *src;
222 struct shadow_time_info *dst;
224 /* src is shared memory with the hypervisor, so we need to
225 make sure we get a consistent snapshot, even in the face of
227 src = &__get_cpu_var(xen_vcpu)->time;
228 dst = &__get_cpu_var(shadow_time);
231 dst->version = src->version;
232 rmb(); /* fetch version before data */
233 dst->tsc_timestamp = src->tsc_timestamp;
234 dst->system_timestamp = src->system_time;
235 dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
236 dst->tsc_shift = src->tsc_shift;
237 rmb(); /* test version after fetching data */
238 } while ((src->version & 1) | (dst->version ^ src->version));
244 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
245 * yielding a 64-bit result.
247 static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
268 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
269 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
272 "mul %%rdx ; shrd $32,%%rdx,%%rax"
273 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
281 static u64 get_nsec_offset(struct shadow_time_info *shadow)
284 now = native_read_tsc();
285 delta = now - shadow->tsc_timestamp;
286 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift);
289 static cycle_t xen_clocksource_read(void)
291 struct shadow_time_info *shadow = &get_cpu_var(shadow_time);
296 version = get_time_values_from_xen();
298 ret = shadow->system_timestamp + get_nsec_offset(shadow);
300 } while (version != __get_cpu_var(xen_vcpu)->time.version);
302 put_cpu_var(shadow_time);
307 static void xen_read_wallclock(struct timespec *ts)
309 const struct shared_info *s = HYPERVISOR_shared_info;
314 /* get wallclock at system boot */
316 version = s->wc_version;
317 rmb(); /* fetch version before time */
318 now.tv_sec = s->wc_sec;
319 now.tv_nsec = s->wc_nsec;
320 rmb(); /* fetch time before checking version */
321 } while ((s->wc_version & 1) | (version ^ s->wc_version));
323 delta = xen_clocksource_read(); /* time since system boot */
324 delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec;
326 now.tv_nsec = do_div(delta, NSEC_PER_SEC);
329 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
332 unsigned long xen_get_wallclock(void)
336 xen_read_wallclock(&ts);
/* Setting the wallclock is a privileged operation; always fail here. */
int xen_set_wallclock(unsigned long now)
{
	/* do nothing for domU */
	return -1;
}
347 static struct clocksource xen_clocksource __read_mostly = {
350 .read = xen_clocksource_read,
352 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
354 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
358 Xen clockevent implementation
360 Xen has two clockevent implementations:
362 The old timer_op one works with all released versions of Xen prior
363 to version 3.0.4. This version of the hypervisor provides a
364 single-shot timer with nanosecond resolution. However, sharing the
365 same event channel is a 100Hz tick which is delivered while the
366 vcpu is running. We don't care about or use this tick, but it will
367 cause the core time code to think the timer fired too soon, and
368 will end up resetting it each time. It could be filtered, but
369 doing so has complications when the ktime clocksource is not yet
370 the xen clocksource (ie, at boot time).
372 The new vcpu_op-based timer interface allows the tick timer period
373 to be changed or turned off. The tick timer is not useful as a
374 periodic timer because events are only delivered to running vcpus.
375 The one-shot timer can report when a timeout is in the past, so
376 set_next_event is capable of returning -ETIME when appropriate.
377 This interface is used when available.
382 Get a hypervisor absolute time. In theory we could maintain an
383 offset between the kernel's time and the hypervisor's time, and
384 apply that to a kernel's absolute timeout. Unfortunately the
385 hypervisor and kernel times can drift even if the kernel is using
386 the Xen clocksource, because ntp can warp the kernel's clocksource.
388 static s64 get_abs_timeout(unsigned long delta)
390 return xen_clocksource_read() + delta;
393 static void xen_timerop_set_mode(enum clock_event_mode mode,
394 struct clock_event_device *evt)
397 case CLOCK_EVT_MODE_PERIODIC:
402 case CLOCK_EVT_MODE_ONESHOT:
405 case CLOCK_EVT_MODE_UNUSED:
406 case CLOCK_EVT_MODE_SHUTDOWN:
407 HYPERVISOR_set_timer_op(0); /* cancel timeout */
412 static int xen_timerop_set_next_event(unsigned long delta,
413 struct clock_event_device *evt)
415 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
417 if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
420 /* We may have missed the deadline, but there's no real way of
421 knowing for sure. If the event was in the past, then we'll
422 get an immediate interrupt. */
427 static const struct clock_event_device xen_timerop_clockevent = {
429 .features = CLOCK_EVT_FEAT_ONESHOT,
431 .max_delta_ns = 0xffffffff,
432 .min_delta_ns = TIMER_SLOP,
438 .set_mode = xen_timerop_set_mode,
439 .set_next_event = xen_timerop_set_next_event,
444 static void xen_vcpuop_set_mode(enum clock_event_mode mode,
445 struct clock_event_device *evt)
447 int cpu = smp_processor_id();
450 case CLOCK_EVT_MODE_PERIODIC:
451 WARN_ON(1); /* unsupported */
454 case CLOCK_EVT_MODE_ONESHOT:
455 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
459 case CLOCK_EVT_MODE_UNUSED:
460 case CLOCK_EVT_MODE_SHUTDOWN:
461 if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) ||
462 HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL))
468 static int xen_vcpuop_set_next_event(unsigned long delta,
469 struct clock_event_device *evt)
471 int cpu = smp_processor_id();
472 struct vcpu_set_singleshot_timer single;
475 WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
477 single.timeout_abs_ns = get_abs_timeout(delta);
478 single.flags = VCPU_SSHOTTMR_future;
480 ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single);
482 BUG_ON(ret != 0 && ret != -ETIME);
487 static const struct clock_event_device xen_vcpuop_clockevent = {
489 .features = CLOCK_EVT_FEAT_ONESHOT,
491 .max_delta_ns = 0xffffffff,
492 .min_delta_ns = TIMER_SLOP,
498 .set_mode = xen_vcpuop_set_mode,
499 .set_next_event = xen_vcpuop_set_next_event,
/* Clockevent template in use; defaults to the timer_op implementation
   and is switched to the vcpu_op one in xen_time_init() when available. */
static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;

/* Per-cpu clockevent device, cloned from *xen_clockevent in xen_setup_timer(). */
static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events);
506 static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
508 struct clock_event_device *evt = &__get_cpu_var(xen_clock_events);
512 if (evt->event_handler) {
513 evt->event_handler(evt);
517 do_stolen_accounting();
522 static void xen_setup_timer(int cpu)
525 struct clock_event_device *evt;
528 printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);
530 name = kasprintf(GFP_KERNEL, "timer%d", cpu);
532 name = "<timer kasprintf failed>";
534 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
535 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
538 evt = &get_cpu_var(xen_clock_events);
539 memcpy(evt, xen_clockevent, sizeof(*evt));
541 evt->cpumask = cpumask_of_cpu(cpu);
543 clockevents_register_device(evt);
545 setup_runstate_info(cpu);
547 put_cpu_var(xen_clock_events);
550 __init void xen_time_init(void)
552 int cpu = smp_processor_id();
554 get_time_values_from_xen();
556 clocksource_register(&xen_clocksource);
558 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
559 /* Successfully turned off 100Hz tick, so we have the
560 vcpuop-based timer interface */
561 printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
562 xen_clockevent = &xen_vcpuop_clockevent;
565 /* Set initial system time with full resolution */
566 xen_read_wallclock(&xtime);
567 set_normalized_timespec(&wall_to_monotonic,
568 -xtime.tv_sec, -xtime.tv_nsec);
572 xen_setup_timer(cpu);